org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities.java Source code


Introduction

Here is the source code for org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities.java, a PXF utility class whose static helpers cover HDFS path normalization, compression codec lookup, file split (fragment) metadata serialization, Avro schema retrieval, and serialization of record fields to delimited text.
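
As a quick, hypothetical illustration (the driver class HdfsUtilitiesExample and the sample values are invented here, and OneField's two-argument (type, val) constructor from the PXF API is assumed), two of the simpler helpers might be used like this:

import org.apache.hawq.pxf.api.OneField;
import org.apache.hawq.pxf.api.io.DataType;
import org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities;

import java.util.Arrays;
import java.util.List;

public class HdfsUtilitiesExample {
    public static void main(String[] args) {
        // Relative data sources are normalized to absolute HDFS paths.
        String path = HdfsUtilities.absoluteDataPath("tmp/data/sales.csv"); // "/tmp/data/sales.csv"

        // A record's fields are rendered as delimited text; BYTEA fields would
        // be converted to an octal representation instead.
        List<OneField> record = Arrays.asList(
                new OneField(DataType.INTEGER.getOID(), 42),
                new OneField(DataType.VARCHAR.getOID(), "hello"));
        String line = HdfsUtilities.toString(record, "|"); // "42|hello"

        System.out.println(path + " -> " + line);
    }
}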

Source

package org.apache.hawq.pxf.plugins.hdfs.utilities;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.hawq.pxf.service.utilities.Utilities;
import org.apache.hawq.pxf.api.io.DataType;
import org.apache.hawq.pxf.api.OneField;
import org.apache.hawq.pxf.api.utilities.InputData;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.FsInput;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.util.ReflectionUtils;

import java.io.*;
import java.util.List;

/**
 * HdfsUtilities class exposes helper methods for PXF classes.
 */
public class HdfsUtilities {
    private static final Log LOG = LogFactory.getLog(HdfsUtilities.class);
    private static Configuration config = new Configuration();
    private static CompressionCodecFactory factory = new CompressionCodecFactory(config);

    /**
     * HDFS data sources are absolute data paths. This method ensures that
     * dataSource begins with '/'.
     *
     * @param dataSource The HDFS path to a file or directory of interest.
     *            Retrieved from the client request.
     * @return an absolute data path
     */
    public static String absoluteDataPath(String dataSource) {
        return (dataSource.charAt(0) == '/') ? dataSource : "/" + dataSource;
    }

    /*
     * Helper routine to get a compression codec class
     */
    private static Class<? extends CompressionCodec> getCodecClass(Configuration conf, String name) {

        Class<? extends CompressionCodec> codecClass;
        try {
            codecClass = conf.getClassByName(name).asSubclass(CompressionCodec.class);
        } catch (ClassNotFoundException e) {
            throw new IllegalArgumentException("Compression codec " + name + " was not found.", e);
        }
        return codecClass;
    }

    /**
     * Helper routine to get compression codec through reflection.
     *
     * @param conf configuration used for reflection
     * @param name codec name
     * @return generated CompressionCodec
     */
    public static CompressionCodec getCodec(Configuration conf, String name) {
        return ReflectionUtils.newInstance(getCodecClass(conf, name), conf);
    }

    /**
     * Helper routine to get compression codec class by path (file suffix).
     *
     * @param path path of file to get codec for
     * @return matching codec class for the path, or null if no codec is needed.
     */
    private static Class<? extends CompressionCodec> getCodecClassByPath(String path) {

        Class<? extends CompressionCodec> codecClass = null;
        CompressionCodec codec = factory.getCodec(new Path(path));
        if (codec != null) {
            codecClass = codec.getClass();
        }
        LOG.debug((codecClass == null ? "No codec" : "Codec " + codecClass) + " was found for file " + path);
        return codecClass;
    }

    /**
     * Returns true if the needed codec is splittable. If no codec is needed,
     * returns true as well.
     *
     * @param path path of the file to be read
     * @return whether the codec needed for reading the specified path is splittable.
     */
    public static boolean isSplittableCodec(Path path) {

        final CompressionCodec codec = factory.getCodec(path);
        if (null == codec) {
            return true;
        }

        return codec instanceof SplittableCompressionCodec;
    }

    /**
     * Checks whether requests should be handled in a single thread or not.
     *
     * @param dataDir hdfs path to the data source
     * @param compCodec the fully qualified name of the compression codec
     * @return whether the request can be run in multi-threaded mode.
     */
    public static boolean isThreadSafe(String dataDir, String compCodec) {

        Class<? extends CompressionCodec> codecClass = (compCodec != null)
                ? HdfsUtilities.getCodecClass(config, compCodec)
                : HdfsUtilities.getCodecClassByPath(dataDir);
        /* bzip2 codec is not thread safe */
        return (codecClass == null || !BZip2Codec.class.isAssignableFrom(codecClass));
    }

    /**
     * Prepares a byte serialization of a file split's information (start, length,
     * hosts) using {@link ObjectOutputStream}.
     *
     * @param fsp file split to be serialized
     * @return byte serialization of fsp
     * @throws IOException if I/O errors occur while writing to the underlying
     *             stream
     */
    public static byte[] prepareFragmentMetadata(FileSplit fsp) throws IOException {
        ByteArrayOutputStream byteArrayStream = new ByteArrayOutputStream();
        ObjectOutputStream objectStream = new ObjectOutputStream(byteArrayStream);
        objectStream.writeLong(fsp.getStart());
        objectStream.writeLong(fsp.getLength());
        objectStream.writeObject(fsp.getLocations());
        // Flush so that data buffered inside the ObjectOutputStream reaches the byte array.
        objectStream.flush();

        return byteArrayStream.toByteArray();
    }

    /**
     * Parses fragment metadata and returns the matching {@link FileSplit}.
     *
     * @param inputData request input data
     * @return FileSplit with fragment metadata
     */
    public static FileSplit parseFragmentMetadata(InputData inputData) {
        try {
            byte[] serializedLocation = inputData.getFragmentMetadata();
            if (serializedLocation == null) {
                throw new IllegalArgumentException("Missing fragment location information");
            }

            ByteArrayInputStream bytesStream = new ByteArrayInputStream(serializedLocation);
            ObjectInputStream objectStream = new ObjectInputStream(bytesStream);

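            // Read the fields back in the same order they were written by prepareFragmentMetadata().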
            long start = objectStream.readLong();
            long length = objectStream.readLong();

            String[] hosts = (String[]) objectStream.readObject();

            FileSplit fileSplit = new FileSplit(new Path(inputData.getDataSource()), start, length, hosts);

            LOG.debug("parsed file split: path " + inputData.getDataSource() + ", start " + start + ", end " + end
                    + ", hosts " + ArrayUtils.toString(hosts));

            return fileSplit;

        } catch (Exception e) {
            throw new RuntimeException("Exception while reading expected fragment metadata", e);
        }
    }

    /**
     * Accesses the Avro file through the "unsplittable" API just to get the
     * schema. The splittable API (AvroInputFormat), which is the one used to
     * fetch the records, does not support getting the Avro schema yet.
     *
     * @param conf Hadoop configuration
     * @param dataSource Avro file (e.g. fileName.avro) path
     * @return the Avro schema
     * @throws IOException if an I/O error occurs while accessing the Avro schema file
     */
    public static Schema getAvroSchema(Configuration conf, String dataSource) throws IOException {
        FsInput inStream = new FsInput(new Path(dataSource), conf);
        DatumReader<GenericRecord> dummyReader = new GenericDatumReader<>();
        // try-with-resources ensures the reader (and underlying stream) is closed even if getSchema() fails
        try (DataFileReader<GenericRecord> dummyFileReader = new DataFileReader<>(inStream, dummyReader)) {
            return dummyFileReader.getSchema();
        }
    }

    /**
     * Returns a string serialization of a list of fields. Fields of binary type
     * (BYTEA) are converted to octal representation to make sure they will be
     * relayed properly to the DB.
     *
     * @param complexRecord list of fields to be stringified
     * @param delimiter delimiter between fields
     * @return string of serialized fields using delimiter
     */
    public static String toString(List<OneField> complexRecord, String delimiter) {
        StringBuilder buff = new StringBuilder();
        String delim = ""; // first iteration has no delimiter
        for (OneField complex : complexRecord) {
            if (complex.type == DataType.BYTEA.getOID()) {
                /* Serialize the byte array as an octal string */
                buff.append(delim);
                Utilities.byteArrayToOctalString((byte[]) complex.val, buff);
            } else {
                buff.append(delim).append(complex.val);
            }
            delim = delimiter;
        }
        return buff.toString();
    }
}
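
Beyond the listing above, here is a rough usage sketch (the class name FragmentMetadataSketch, the file paths, and the host names are made up for illustration) of how the fragment-metadata and compression-codec helpers might be exercised with a default Hadoop configuration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities;

import java.io.IOException;

public class FragmentMetadataSketch {
    public static void main(String[] args) throws IOException {
        // Serialize a split's location information (start, length, hosts) so it
        // can be passed around as fragment metadata.
        FileSplit split = new FileSplit(new Path("/tmp/data/part-0001"), 0L, 64L * 1024 * 1024,
                new String[] { "datanode1", "datanode2" });
        byte[] metadata = HdfsUtilities.prepareFragmentMetadata(split);
        System.out.println("serialized " + metadata.length + " bytes of fragment metadata");

        // With a default configuration the .bz2 suffix resolves to BZip2Codec,
        // which is splittable in recent Hadoop versions but is not thread safe.
        CompressionCodec codec = HdfsUtilities.getCodec(new Configuration(),
                "org.apache.hadoop.io.compress.BZip2Codec");
        System.out.println("codec:       " + codec.getClass().getName());
        System.out.println("splittable:  " + HdfsUtilities.isSplittableCodec(new Path("/tmp/data/part-0001.bz2")));
        System.out.println("thread safe: " + HdfsUtilities.isThreadSafe("/tmp/data/part-0001.bz2", null));
    }
}

The inverse helper, parseFragmentMetadata, is not shown here because it takes a PXF InputData request object rather than a raw byte array.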