net.iponweb.hadoop.streaming.parquet.TextRecordWriterWrapper.java Source code

Java tutorial

Introduction

Here is the source code for net.iponweb.hadoop.streaming.parquet.TextRecordWriterWrapper.java

Source

/**
 * Copyright 2014 IPONWEB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package net.iponweb.hadoop.streaming.parquet;

import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetRecordWriter;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Stack;

public class TextRecordWriterWrapper implements RecordWriter<Text, Text> {

    protected ParquetRecordWriter<SimpleGroup> realWriter;
    protected MessageType schema;
    protected SimpleGroupFactory factory;
    private static final String TAB = "\t";
    protected ArrayList<PathAction> recorder;

    TextRecordWriterWrapper(ParquetRecordWriter<SimpleGroup> w, FileSystem fs, JobConf conf, String name,
            Progressable progress) throws IOException {

        realWriter = w;
        schema = GroupWriteSupport.getSchema(conf);
        factory = new SimpleGroupFactory(schema);

        recorder = new ArrayList<>();
        ArrayList<String[]> Paths = (ArrayList<String[]>) schema.getPaths();
        Iterator<String[]> pi = Paths.listIterator();

        String[] prevPath = {};

        short grpDepth = 0;
        while (pi.hasNext()) {

            String p[] = pi.next();

            // Find longest common path between prev_path and current
            ArrayList<String> commonPath = new ArrayList<String>();
            for (int n = 0; n < prevPath.length; n++) {
                if (n < p.length && p[n].equals(prevPath[n])) {
                    commonPath.add(p[n]);
                } else
                    break;
            }

            // If current element is not inside previous group, restore to the group of common path
            for (int n = commonPath.size(); n < prevPath.length - 1; n++) {
                recorder.add(new PathAction(PathAction.ActionType.GROUPEND));
                grpDepth--;
            }

            // If current element is not right after common path, create all required groups
            for (int n = commonPath.size(); n < p.length - 1; n++) {
                PathAction a = new PathAction(PathAction.ActionType.GROUPSTART);
                a.setName(p[n]);
                recorder.add(a);
                grpDepth++;
            }

            prevPath = p;

            PathAction a = new PathAction(PathAction.ActionType.FIELD);

            Type colType = schema.getType(p);

            a.setType(colType.asPrimitiveType().getPrimitiveTypeName());
            a.setRepetition(colType.getRepetition());
            a.setName(p[p.length - 1]);

            recorder.add(a);
        }

        // Close trailing groups
        while (grpDepth-- > 0)
            recorder.add(new PathAction(PathAction.ActionType.GROUPEND));
    }

    @Override
    public void close(Reporter reporter) throws IOException {
        try {
            realWriter.close(null);
        } catch (InterruptedException e) {
            Thread.interrupted();
            throw new IOException(e);
        }
    }

    @Override
    public void write(Text key, Text value) throws IOException {

        Group grp = factory.newGroup();

        String[] strK = key.toString().split(TAB, -1);
        String[] strV = value == null ? new String[0] : value.toString().split(TAB, -1);
        String kv_combined[] = (String[]) ArrayUtils.addAll(strK, strV);

        Iterator<PathAction> ai = recorder.iterator();

        Stack<Group> groupStack = new Stack<>();
        groupStack.push(grp);
        int i = 0;

        try {
            while (ai.hasNext()) {

                PathAction a = ai.next();
                switch (a.getAction()) {
                case GROUPEND:
                    grp = groupStack.pop();
                    break;

                case GROUPSTART:
                    groupStack.push(grp);
                    grp = grp.addGroup(a.getName());
                    break;

                case FIELD:
                    String s = null;
                    PrimitiveType.PrimitiveTypeName primType = a.getType();
                    String colName = a.getName();

                    if (i < kv_combined.length)
                        s = kv_combined[i++];

                    if (s == null) {
                        if (a.getRepetition() == Type.Repetition.OPTIONAL) {
                            i++;
                            continue;
                        }
                        s = primType == PrimitiveType.PrimitiveTypeName.BINARY ? "" : "0";
                    }

                    // If we have 'repeated' field, assume that we should expect JSON-encoded array
                    // Convert array and append all values
                    int repetition = 1;
                    boolean repeated = false;
                    ArrayList<String> s_vals = null;
                    if (a.getRepetition() == Type.Repetition.REPEATED) {
                        repeated = true;
                        s_vals = new ArrayList<>();
                        ObjectMapper mapper = new ObjectMapper();
                        JsonNode node = mapper.readTree(s);
                        Iterator<JsonNode> itr = node.iterator();
                        repetition = 0;
                        while (itr.hasNext()) {

                            String o;
                            switch (primType) {
                            case BINARY:
                                o = itr.next().getTextValue(); // No array-of-objects!
                                break;
                            case BOOLEAN:
                                o = String.valueOf(itr.next().getBooleanValue());
                                break;
                            default:
                                o = String.valueOf(itr.next().getNumberValue());
                            }

                            s_vals.add(o);
                            repetition++;
                        }
                    }

                    for (int j = 0; j < repetition; j++) {

                        if (repeated)
                            // extract new s
                            s = s_vals.get(j);

                        try {
                            switch (primType) {

                            case INT32:
                                grp.append(colName, new Double(Double.parseDouble(s)).intValue());
                                break;
                            case INT64:
                            case INT96:
                                grp.append(colName, new Double(Double.parseDouble(s)).longValue());
                                break;
                            case DOUBLE:
                                grp.append(colName, Double.parseDouble(s));
                                break;
                            case FLOAT:
                                grp.append(colName, Float.parseFloat(s));
                                break;
                            case BOOLEAN:
                                grp.append(colName, s.equalsIgnoreCase("true") || s.equals("1"));
                                break;
                            case BINARY:
                                grp.append(colName, Binary.fromString(s));
                                break;
                            default:
                                throw new RuntimeException("Can't handle type " + primType);
                            }
                        } catch (NumberFormatException e) {

                            grp.append(colName, 0);
                        }
                    }
                }
            }

            realWriter.write(null, (SimpleGroup) grp);

        } catch (InterruptedException e) {
            Thread.interrupted();
            throw new IOException(e);
        } catch (Exception e) {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            e.printStackTrace(new PrintStream(out));
            throw new RuntimeException("Failed on record " + grp + ", schema=" + schema + ", path action="
                    + recorder + " exception = " + e.getClass() + ", msg=" + e.getMessage() + ", cause="
                    + e.getCause() + ", trace=" + out.toString());
        }

    }
}