org.apache.tajo.storage.regex.RegexLineDeserializer.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tajo.storage.regex.RegexLineDeserializer.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tajo.storage.regex;

import io.netty.buffer.ByteBuf;
import io.netty.util.CharsetUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tajo.catalog.Column;
import org.apache.tajo.catalog.Schema;
import org.apache.tajo.catalog.TableMeta;
import org.apache.tajo.datum.Datum;
import org.apache.tajo.datum.NullDatum;
import org.apache.tajo.exception.InvalidTablePropertyException;
import org.apache.tajo.exception.TajoRuntimeException;
import org.apache.tajo.plan.util.PlannerUtil;
import org.apache.tajo.storage.FieldSerializerDeserializer;
import org.apache.tajo.storage.StorageConstants;
import org.apache.tajo.storage.Tuple;
import org.apache.tajo.storage.text.TextFieldSerializerDeserializer;
import org.apache.tajo.storage.text.TextLineDeserializer;
import org.apache.tajo.storage.text.TextLineParsingError;
import org.apache.tajo.storage.text.TextLineSerDe;

import java.io.IOException;
import java.nio.charset.CharsetDecoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Deserializes a text line into a {@link Tuple} by matching the whole line against a
 * regular expression and mapping capturing group {@code N} to schema column index
 * {@code N - 1}.
 *
 * <p>The expression is read from the table property {@link StorageConstants#TEXT_REGEX};
 * {@link StorageConstants#TEXT_REGEX_CASE_INSENSITIVE} toggles case-insensitive matching.
 * Lines that do not match, and groups that are missing or cannot be deserialized, yield
 * {@link NullDatum} for the affected projected columns. Warnings about such rows are
 * throttled: one is logged when the respective counter reaches 1, 100, 10000, and so on.
 */
public class RegexLineDeserializer extends TextLineDeserializer {
    private static final Log LOG = LogFactory.getLog(RegexLineDeserializer.class);

    private final CharsetDecoder decoder = CharsetUtil.getDecoder(CharsetUtil.UTF_8);
    private FieldSerializerDeserializer fieldSerDer;
    private ByteBuf nullChars;

    // Schema column indexes of the projected columns; deserialize() walks them in step
    // with the regex groups, so they are presumably in ascending order (verify against
    // PlannerUtil.getTargetIds).
    private final int[] targetColumnIndexes;
    private String inputRegex;
    private Pattern inputPattern;
    // Number of rows not matching the regex
    private long unmatchedRows = 0;
    // Value of unmatchedRows at which the next warning is logged
    private long nextUnmatchedRows = 1;
    // Number of rows that match the regex but have missing or unreadable groups.
    private long partialMatchedRows = 0;
    // Value of partialMatchedRows at which the next warning is logged
    private long nextPartialMatchedRows = 1;

    /**
     * @param schema    table schema, forwarded to the parent deserializer
     * @param meta      table meta holding the regex properties, forwarded to the parent
     * @param projected columns that should be filled in the output tuple
     */
    public RegexLineDeserializer(Schema schema, TableMeta meta, Column[] projected) {
        super(schema, meta);
        targetColumnIndexes = PlannerUtil.getTargetIds(schema, projected);
    }

    /**
     * Creates the field deserializer, compiles the configured regular expression and
     * (re)loads the null-character buffer.
     *
     * @throws TajoRuntimeException wrapping an {@link InvalidTablePropertyException} when the
     *                              table has no {@link StorageConstants#TEXT_REGEX} property
     */
    @Override
    public void init() {
        fieldSerDer = new TextFieldSerializerDeserializer(meta);
        fieldSerDer.init(schema);

        // Read the configuration parameters
        inputRegex = meta.getProperty(StorageConstants.TEXT_REGEX);
        boolean inputRegexIgnoreCase = "true"
                .equalsIgnoreCase(meta.getProperty(StorageConstants.TEXT_REGEX_CASE_INSENSITIVE, "false"));

        if (inputRegex == null) {
            throw new TajoRuntimeException(new InvalidTablePropertyException(StorageConstants.TEXT_REGEX,
                    "This table does not have serde property \"" + StorageConstants.TEXT_REGEX + "\"!"));
        }

        // Pattern flags are bit masks, so combine them with a bitwise OR.
        int flags = Pattern.DOTALL;
        if (inputRegexIgnoreCase) {
            flags |= Pattern.CASE_INSENSITIVE;
        }
        inputPattern = Pattern.compile(inputRegex, flags);

        // Drop the buffer of a previous init() before replacing it.
        if (nullChars != null) {
            nullChars.release();
        }
        nullChars = TextLineSerDe.getNullChars(meta);
    }

    /**
     * Matches the readable bytes of {@code lineBuf} against the configured pattern and
     * stores one datum per projected column into {@code output}.
     *
     * <p>The reader and writer indexes of {@code lineBuf} are moved to the bytes of each
     * deserialized group and are not restored afterwards.
     *
     * @param lineBuf buffer holding one UTF-8 encoded line; nothing is done when {@code null}
     * @param output  tuple receiving the values, indexed by position in the projection
     */
    @Override
    public void deserialize(final ByteBuf lineBuf, Tuple output) throws IOException, TextLineParsingError {

        if (lineBuf == null || targetColumnIndexes.length == 0) {
            return;
        }

        // Capture the line boundaries up front: setIndex() below moves the reader index.
        final int lineStart = lineBuf.readerIndex();
        final int lineBytes = lineBuf.readableBytes();
        final String line = decoder.decode(lineBuf.nioBuffer(lineStart, lineBytes)).toString();
        final int[] projection = targetColumnIndexes;

        // Position in the projection (and in the output tuple) that is filled next
        int currentTarget = 0;
        final Matcher m = inputPattern.matcher(line);

        if (!m.matches()) {
            unmatchedRows++;
            if (unmatchedRows >= nextUnmatchedRows) {
                nextUnmatchedRows *= 100;
                // Report the row
                LOG.warn("" + unmatchedRows + " unmatched rows are found: " + line);
            }
        } else {
            // The matcher reports offsets in UTF-16 chars, the buffer is addressed in bytes.
            // When the decoded text has as many chars as the line has bytes, every char came
            // from exactly one byte and both offsets coincide; otherwise translate through a
            // table built once per line.
            final int[] byteOffsets = line.length() == lineBytes ? null : utf8ByteOffsets(line);

            final int groupCount = m.groupCount();
            for (int group = 1; group <= groupCount && currentTarget < projection.length; group++) {
                // Group N holds the text of schema column N - 1.
                final int columnIndex = group - 1;
                if (columnIndex != projection[currentTarget]) {
                    continue;
                }

                final int charStart = m.start(group);
                if (charStart < 0) {
                    // The line matched, but this (optional) group took no part in the match.
                    recordPartialMatch(columnIndex, line, null);
                    output.put(currentTarget, NullDatum.get());
                } else {
                    try {
                        final int from = lineStart + byteOffset(byteOffsets, charStart);
                        final int to = lineStart + byteOffset(byteOffsets, m.end(group));
                        Datum datum = fieldSerDer.deserialize(columnIndex, lineBuf.setIndex(from, to), nullChars);

                        output.put(currentTarget, datum);
                    } catch (Exception e) {
                        // Best effort: a field that cannot be read becomes NULL instead of
                        // failing the whole scan.
                        recordPartialMatch(columnIndex, line, e);
                        output.put(currentTarget, NullDatum.get());
                    }
                }
                currentTarget++;
            }
        }

        /* Projected columns that received no value (unmatched row or too few groups) become NULL */
        while (currentTarget < projection.length) {
            output.put(currentTarget, NullDatum.get());
            currentTarget++;
        }
    }

    /** Releases the null-character buffer, if one is held. */
    @Override
    public void release() {
        if (nullChars != null) {
            nullChars.release();
            nullChars = null;
        }
    }

    /** Counts a row with a missing or unreadable group and logs it when the throttle allows. */
    private void recordPartialMatch(int columnIndex, String line, Exception cause) {
        partialMatchedRows++;
        if (partialMatchedRows >= nextPartialMatchedRows) {
            nextPartialMatchedRows *= 100;
            // Report the row
            final String message = "" + partialMatchedRows + " partially unmatched rows are found, "
                    + " cannot find group " + columnIndex + ": " + line;
            if (cause == null) {
                LOG.warn(message);
            } else {
                LOG.warn(message, cause);
            }
        }
    }

    /** Translates a char offset to a byte offset; a {@code null} table means both are equal. */
    private static int byteOffset(int[] byteOffsets, int charIndex) {
        return byteOffsets == null ? charIndex : byteOffsets[charIndex];
    }

    /**
     * Builds a table mapping every char offset of {@code text} (including its length) to the
     * offset of the same position in the UTF-8 encoding of {@code text}.
     *
     * <p>NOTE(review): this assumes the line was well-formed UTF-8. If the decoder substitutes
     * malformed bytes, the table no longer mirrors the original buffer; an out-of-range
     * result is then handled by the caller's best-effort catch.
     */
    private static int[] utf8ByteOffsets(String text) {
        final int length = text.length();
        final int[] offsets = new int[length + 1];
        int bytes = 0;
        int i = 0;
        while (i < length) {
            offsets[i] = bytes;
            final char c = text.charAt(i);
            if (c < 0x80) {
                bytes += 1;
                i++;
            } else if (c < 0x800) {
                bytes += 2;
                i++;
            } else if (Character.isHighSurrogate(c) && i + 1 < length
                    && Character.isLowSurrogate(text.charAt(i + 1))) {
                // A surrogate pair is one 4-byte sequence; an offset between its two chars
                // is mapped to the start of that sequence.
                offsets[i + 1] = bytes;
                bytes += 4;
                i += 2;
            } else {
                bytes += 3;
                i++;
            }
        }
        offsets[length] = bytes;
        return offsets;
    }
}