org.apache.pig.piggybank.storage.FixedWidthStorer.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pig.piggybank.storage.FixedWidthStorer.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pig.piggybank.storage;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.PigWarning;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.parser.ParserException;

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;

/**
 * Stores Pig records in a fixed-width file format.
 *
 * Takes a string argument specifying the ranges of each column in a unix 'cut'-like format.
 * Ex: '-5, 10-12, 14, 20-'
 * Ranges are comma-separated, 1-indexed (for ease of use with 1-indexed text editors), and inclusive.
 * A single-column field at position n may be specified as either 'n-n' or simply 'n'.
 *
 * A second optional argument specifies whether to write a header record
 * with the names of each field. 'WRITE_HEADER' writes a header record;
 * 'NO_HEADER' (the default) does not.
 *
 * Values are right-justified within their column. Nulls, and values that cannot be made
 * to fit in their column, are written as all spaces. Float and double values that are too
 * wide are rounded to fewer decimal places when that makes them fit.
 *
 * All datetimes are stored in UTC.
 *
 * Column spec idea and syntax parser borrowed from Russ Lankenau's FixedWidthLoader implementation
 * at https://github.com/rlankenau/fixed-width-pig-loader
 */
public class FixedWidthStorer extends StoreFunc {

    // Raw type on purpose: prepareToWrite() hands us a raw RecordWriter.
    private RecordWriter writer = null;

    // One entry per output column, parsed from the column spec passed to the constructor.
    private ArrayList<FixedWidthLoader.FixedWidthField> columns;

    private ResourceSchema schema = null;
    private ResourceFieldSchema[] fields;

    // True until the first call to putNext() on this instance has been handled.
    private boolean writingFirstRecord = true;
    private boolean writeHeader = false;

    private String udfContextSignature = null;
    private static final String SCHEMA_SIGNATURE = "pig.fixedwidthloader.schema";
    private static final Log log = LogFactory.getLog(FixedWidthStorer.class);

    /*
     * Constructors and constructor helper methods
     */

    /**
     * Always fails: a column spec is required, so the exception carries the usage string.
     *
     * @throws IllegalArgumentException always
     */
    public FixedWidthStorer() {
        throw new IllegalArgumentException("Usage: org.apache.pig.piggybank.storage.FixedWidthStorer("
                + "'<column spec>'[, { 'WRITE_HEADER' | 'NO_HEADER' }]" + ")");
    }

    /**
     * Creates a storer that writes no header record.
     *
     * @param columnSpec column ranges, parsed by {@code FixedWidthLoader.parseColumnSpec}
     */
    public FixedWidthStorer(String columnSpec) {
        columns = FixedWidthLoader.parseColumnSpec(columnSpec);
    }

    /**
     * Creates a storer, optionally writing a header record.
     *
     * @param columnSpec column ranges, parsed by {@code FixedWidthLoader.parseColumnSpec}
     * @param headerStr  'WRITE_HEADER' (case-insensitive) enables the header record;
     *                   any other value, including null, leaves it disabled
     */
    public FixedWidthStorer(String columnSpec, String headerStr) {
        this(columnSpec);

        // Constant-first comparison so a null argument means "no header" instead of an NPE.
        if ("WRITE_HEADER".equalsIgnoreCase(headerStr)) {
            writeHeader = true;
        }
    }

    /*
     * Methods called on the frontend
     */

    @Override
    public OutputFormat getOutputFormat() throws IOException {
        // Key is unused, Text is where the data is stored in
        return new TextOutputFormat<LongWritable, Text>();
    }

    @Override
    public void setStoreLocation(String location, Job job) throws IOException {
        FileOutputFormat.setOutputPath(job, new Path(location));
    }

    @Override
    public void setStoreFuncUDFContextSignature(String signature) {
        udfContextSignature = signature;
    }

    /**
     * Does not validate anything; it saves the schema's string form in the UDF properties
     * (keyed by the UDF context signature) so that prepareToWrite() can read it back.
     */
    @Override
    public void checkSchema(ResourceSchema s) throws IOException {
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(this.getClass(), new String[] { udfContextSignature });
        p.setProperty(SCHEMA_SIGNATURE, s.toString());
    }

    /*
     * Methods called on the backend
     */

    /**
     * Keeps the writer for use in putNext() and rebuilds the schema from the string
     * that checkSchema() stored in the UDF properties.
     *
     * @throws IOException if no schema string is present in the UDF properties
     */
    @Override
    public void prepareToWrite(RecordWriter writer) throws IOException {
        // Store writer to use in putNext()
        this.writer = writer;

        // Get the schema string from the UDFContext object.
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(this.getClass(), new String[] { udfContextSignature });
        String strSchema = p.getProperty(SCHEMA_SIGNATURE);
        if (strSchema == null) {
            throw new IOException("Could not find schema in UDF context");
        }

        schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
        fields = schema.getFields();
    }

    /**
     * Writes one tuple as a fixed-width line. If a header was requested, the first call
     * on this instance writes a line of field names before the record itself.
     *
     * @param t the tuple to write; field i is placed in column i of the column spec
     * @throws IOException if the column spec has fewer columns than the schema has fields,
     *                     if a tuple field cannot be read, or if the write fails or is interrupted
     */
    @Override
    public void putNext(Tuple t) throws IOException {
        // Fail with an explanation instead of an IndexOutOfBoundsException from columns.get(i).
        if (columns.size() < fields.length) {
            throw new IOException("Column spec defines " + columns.size() + " column(s) but the schema has "
                    + fields.length + " field(s)");
        }

        // Write header row if this is the first record
        if (writingFirstRecord && writeHeader) {
            StringBuilder header = new StringBuilder();
            int headerOffset = 0;
            for (int i = 0; i < fields.length; i++) {
                FixedWidthLoader.FixedWidthField column = columns.get(i);
                header.append(writeFieldAsString(fields[i], column, headerOffset, fields[i].getName()));
                headerOffset = column.end;
            }
            writeLine(header.toString());
        }
        writingFirstRecord = false;

        StringBuilder record = new StringBuilder();
        int offset = 0;
        for (int i = 0; i < fields.length; i++) {
            FixedWidthLoader.FixedWidthField column = columns.get(i);
            record.append(writeFieldAsString(fields[i], column, offset, t.get(i)));
            offset = column.end;
        }
        writeLine(record.toString());
    }

    /** Hands one finished line to the record writer with a null key. */
    @SuppressWarnings("unchecked") // writer is a raw RecordWriter, see prepareToWrite()
    private void writeLine(String line) throws IOException {
        try {
            writer.write(null, new Text(line));
        } catch (InterruptedException ie) {
            // Re-assert the interrupt so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            throw new IOException(ie);
        }
    }

    /**
     * Renders one value as the text for its column, including any gap of spaces between
     * the previous column's end and this column's start.
     *
     * @param field  schema of the field; only its type is consulted, to decide whether rounding applies
     * @param column the column to fill; its width is taken as {@code column.end - column.start}
     * @param offset position at which the previous column ended (0 for the first column)
     * @param d      the value to write, or null
     * @return the gap padding followed by the column text (the value right-justified, or all spaces)
     */
    private String writeFieldAsString(ResourceFieldSchema field, FixedWidthLoader.FixedWidthField column,
            int offset, Object d) throws IOException {

        StringBuilder sb = new StringBuilder();

        // Pad the gap before this column; a non-positive gap adds nothing.
        appendSpaces(sb, column.start - offset);

        int width = column.end - column.start;
        String fieldStr = null;
        if (d != null) {
            if (DataType.findType(d) == DataType.DATETIME) {
                fieldStr = ((DateTime) d).toDateTime(DateTimeZone.UTC).toString();
            } else {
                fieldStr = d.toString();
            }
        }

        // write nulls as spaces
        if (fieldStr == null) {
            appendSpaces(sb, width);
            return sb.toString();
        }

        // Field can fit. Right-justify it.
        if (fieldStr.length() <= width) {
            appendSpaces(sb, width - fieldStr.length());
            sb.append(fieldStr);
            return sb.toString();
        }

        // The field is too big for the column. Only a float/double can be rescued by rounding.
        // The instanceof check matters for header rows: there d is the field name (a String)
        // even though the field's declared type may be FLOAT or DOUBLE.
        byte fieldType = field.getType();
        String rounded = null;
        if ((fieldType == DataType.FLOAT || fieldType == DataType.DOUBLE) && d instanceof Number) {
            rounded = roundToFit(((Number) d).doubleValue(), width);
        }

        if (rounded != null) {
            warn("Cannot fit " + fieldStr + " in field starting at column " + column.start
                    + " and ending at column " + (column.end - 1) + ". "
                    + "Since the field is a decimal type, truncating it to " + rounded + " "
                    + "to fit in the column.", PigWarning.UDF_WARNING_1);
            appendSpaces(sb, width - rounded.length());
            sb.append(rounded);
        } else {
            warn("Cannot fit " + fieldStr + " in field starting at column " + column.start
                    + " and ending at column " + (column.end - 1) + ". "
                    + "Writing null (all spaces) instead.", PigWarning.UDF_WARNING_2);
            appendSpaces(sb, width);
        }

        return sb.toString();
    }

    /**
     * Finds the most precise plain-decimal rendering of a value that fits in a column.
     * Candidates are measured after formatting, so the minus sign, values below 1,
     * exact powers of ten and rounding carries (9.99 becoming 10.0) are all accounted for.
     *
     * @param value the number to render
     * @param width the column width in characters
     * @return the rendering with at least one digit after the decimal point,
     *         or null if no such rendering fits in {@code width} characters
     */
    private static String roundToFit(double value, int width) {
        if (Double.isNaN(value) || Double.isInfinite(value)) {
            return null;
        }
        // The shortest possible form is "d.d...", so at most width - 2 decimals can ever fit.
        for (int decimals = width - 2; decimals >= 1; decimals--) {
            // Locale.ROOT keeps '.' as the decimal separator, matching the toString() path.
            String candidate = String.format(Locale.ROOT, "%." + decimals + "f", value);
            if (candidate.length() <= width) {
                return candidate;
            }
        }
        return null;
    }

    /** Appends {@code count} spaces to {@code sb}; does nothing when {@code count} is not positive. */
    private static void appendSpaces(StringBuilder sb, int count) {
        for (int i = 0; i < count; i++) {
            sb.append(' ');
        }
    }

    public ResourceStatistics getStatistics(String location, Job job) throws IOException {
        // Not implemented
        return null;
    }

    public void storeStatistics(ResourceStatistics stats, String location, Job job) throws IOException {
        // Not implemented
    }

    public String[] getPartitionKeys(String location, Job job) throws IOException {
        // Not implemented
        return null;
    }

    public void setPartitionFilter(Expression partitionFilter) throws IOException {
        // Not implemented
    }
}