org.apache.hadoop.hive.ql.QOutProcessor.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.hive.ql.QOutProcessor.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hive.ql.QTestUtil.FsType;

/**
 * QOutProcessor: produces the final q.out from the original q.out by postprocessing (e.g. masks).
 *
 * <p>Each line is run through several groups of regular expressions (see
 * {@link #processLine(String, String)}); text that matches is replaced with a fixed mask string.
 * {@link #maskPatterns(String, String)} applies this to a whole file and folds runs of fully
 * masked lines into a single line.
 */
public class QOutProcessor {
    public static final String PATH_HDFS_REGEX = "(hdfs://)([a-zA-Z0-9:/_\\-\\.=])+";
    public static final String PATH_HDFS_WITH_DATE_USER_GROUP_REGEX = "([a-z]+) ([a-z]+)([ ]+)([0-9]+) ([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}) "
            + PATH_HDFS_REGEX;

    public static final String HDFS_MASK = "### HDFS PATH ###";
    public static final String HDFS_DATE_MASK = "### HDFS DATE ###";
    public static final String HDFS_USER_MASK = "### USER ###";
    public static final String HDFS_GROUP_MASK = "### GROUP ###";

    public static final String MASK_PATTERN = "#### A masked pattern was here ####";
    public static final String PARTIAL_MASK_PATTERN = "#### A PARTIAL masked pattern was here ####";

    private static final PatternReplacementPair MASK_STATS = new PatternReplacementPair(
            Pattern.compile(" Num rows: [1-9][0-9]* Data size: [1-9][0-9]*"),
            " Num rows: ###Masked### Data size: ###Masked###");
    private static final PatternReplacementPair MASK_DATA_SIZE = new PatternReplacementPair(
            Pattern.compile(" Data size: [1-9][0-9]*"), " Data size: ###Masked###");
    private static final PatternReplacementPair MASK_LINEAGE = new PatternReplacementPair(
            Pattern.compile("POSTHOOK: Lineage: .*"), "POSTHOOK: Lineage: ###Masked###");

    // Directives looked for in the query text by initMasks().
    private static final Pattern PATTERN_MASK_STATS = Pattern.compile("-- MASK_STATS");
    private static final Pattern PATTERN_MASK_DATA_SIZE = Pattern.compile("-- MASK_DATA_SIZE");
    private static final Pattern PATTERN_MASK_LINEAGE = Pattern.compile("-- MASK_LINEAGE");

    /**
     * Patterns whose matches are replaced by {@link #MASK_PATTERN}. Most are of the form
     * {@code .*X.*} and therefore replace the whole line. Compiled once for the class because
     * the list is constant and {@link Pattern} is immutable.
     */
    private static final Pattern[] PLAN_MASK = toPattern(new String[] { ".*file:.*", ".*pfile:.*", ".*/tmp/.*",
            ".*invalidscheme:.*", ".*lastUpdateTime.*", ".*lastAccessTime.*", ".*lastModifiedTime.*",
            ".*[Oo]wner.*", ".*CreateTime.*", ".*LastAccessTime.*", ".*Location.*", ".*LOCATION '.*",
            ".*transient_lastDdlTime.*", ".*last_modified_.*", ".*at org.*", ".*at sun.*", ".*at java.*",
            ".*at junit.*", ".*Caused by:.*", ".*LOCK_QUERYID:.*", ".*LOCK_TIME:.*", ".*grantTime.*",
            ".*[.][.][.] [0-9]* more.*", ".*job_[0-9_]*.*", ".*job_local[0-9_]*.*", ".*USING 'java -cp.*",
            "^Deleted.*", ".*DagName:.*", ".*DagId:.*", ".*Input:.*/data/files/.*", ".*Output:.*/data/files/.*",
            ".*total number of created files now is.*", ".*.hive-staging.*", "pk_-?[0-9]*_[0-9]*_[0-9]*",
            "fk_-?[0-9]*_[0-9]*_[0-9]*", "uk_-?[0-9]*_[0-9]*_[0-9]*", "nn_-?[0-9]*_[0-9]*_[0-9]*", // not null constraint name
            "dc_-?[0-9]*_[0-9]*_[0-9]*", // default constraint name
            ".*at com\\.sun\\.proxy.*", ".*at com\\.jolbox.*", ".*at com\\.zaxxer.*",
            "org\\.apache\\.hadoop\\.hive\\.metastore\\.model\\.MConstraint@([0-9]|[a-z])*",
            "^Repair: Added partition to metastore.*", "^latestOffsets.*", "^minimumLag.*" });

    /** Patterns used instead of {@link #PARTIAL_PLAN_MASK} when the file system is encrypted HDFS. */
    private static final Pattern[] PARTIAL_RESERVED_PLAN_MASK = toPattern(
            new String[] { "data/warehouse/(.*?/)+\\.hive-staging" // the directory might be db/table/partition
            //TODO: add more expected test result here
            });

    /** Partial masks applied when the file system is not encrypted HDFS. */
    private static final PatternReplacementPair[] PARTIAL_PLAN_MASK = {
            new PatternReplacementPair(Pattern.compile("\\{\"writeid\":[1-9][0-9]*,\"bucketid\":"),
                    "{\"writeid\":### Masked writeid ###,\"bucketid\":"),
            new PatternReplacementPair(Pattern.compile("attempt_[0-9_]+"), "attempt_#ID#"),
            new PatternReplacementPair(Pattern.compile("vertex_[0-9_]+"), "vertex_#ID#"),
            new PatternReplacementPair(Pattern.compile("task_[0-9_]+"), "task_#ID#"),
            new PatternReplacementPair(Pattern.compile("for Spark session.*?:"), "#SPARK_SESSION_ID#:") };

    // Names registered by initMasks() for which the corresponding per-test mask is enabled.
    private final Set<String> qMaskStatsQuerySet = new HashSet<String>();
    private final Set<String> qMaskDataSizeQuerySet = new HashSet<String>();
    private final Set<String> qMaskLineageQuerySet = new HashSet<String>();

    /* This list may be modified by specific cli drivers to mask strings that change on every test */
    private final List<Pair<Pattern, String>> patternsWithMaskComments = defaultPatternsWithMaskComments();

    private final FsType fsType;

    /**
     * Holds a line while it is being masked, together with a flag recording whether one of the
     * partial masks has already rewritten it.
     */
    public static class LineProcessingResult {
        private String line;
        private boolean partialMaskWasMatched = false;

        public LineProcessingResult(String line) {
            this.line = line;
        }

        /**
         * @return the line in its current (possibly masked) form
         */
        public String get() {
            return line;
        }
    }

    /**
     * Pattern to match and (partial) replacement text.
     * For example, {"transaction":76,"bucketid":8249877}.  We just want to mask 76 but a regex that
     * matches just 76 will match a lot of other things.
     */
    private static final class PatternReplacementPair {
        private final Pattern pattern;
        private final String replacement;

        PatternReplacementPair(Pattern p, String r) {
            pattern = p;
            replacement = r;
        }
    }

    /**
     * @param fsType file system type; selects which partial masks {@link #processLine} applies
     */
    public QOutProcessor(FsType fsType) {
        this.fsType = fsType;
    }

    /** Creates a processor for {@code FsType.hdfs}. */
    public QOutProcessor() {
        this.fsType = FsType.hdfs;
    }

    /** Compiles every regex string in the array, preserving order. */
    private static Pattern[] toPattern(String[] patternStrs) {
        Pattern[] patterns = new Pattern[patternStrs.length];
        for (int i = 0; i < patternStrs.length; i++) {
            patterns[i] = Pattern.compile(patternStrs[i]);
        }
        return patterns;
    }

    /** Compiles {@code patternStr} and pairs it with the replacement text to use for its matches. */
    private static Pair<Pattern, String> toPatternPair(String patternStr, String maskComment) {
        return ImmutablePair.of(Pattern.compile(patternStr), maskComment);
    }

    /** Builds a fresh, mutable list holding the built-in pattern/mask-comment pairs. */
    private static List<Pair<Pattern, String>> defaultPatternsWithMaskComments() {
        List<Pair<Pattern, String>> defaults = new ArrayList<>();
        defaults.add(toPatternPair("(pblob|s3.?|swift|wasb.?).*hive-staging.*", "### BLOBSTORE_STAGING_PATH ###"));
        // $3, $4 and $6 are back-references into PATH_HDFS_WITH_DATE_USER_GROUP_REGEX.
        defaults.add(toPatternPair(PATH_HDFS_WITH_DATE_USER_GROUP_REGEX, String.format("%s %s$3$4 %s $6%s",
                HDFS_USER_MASK, HDFS_GROUP_MASK, HDFS_DATE_MASK, HDFS_MASK)));
        defaults.add(toPatternPair(PATH_HDFS_REGEX, String.format("$1%s", HDFS_MASK)));
        return defaults;
    }

    /**
     * Rewrites the file {@code fname} in place with every line masked by
     * {@link #processLine(String, String)}. The unprocessed content is first copied to
     * {@code fname + ".orig"}. Consecutive lines that end up equal to {@link #MASK_PATTERN} are
     * folded into a single line. Files are read and written as UTF-8 and every output line is
     * terminated with {@code "\n"}.
     *
     * @param fname path of the file to mask
     * @param tname name passed through to {@link #processLine(String, String)}
     * @throws Exception if copying, reading or writing fails
     */
    public void maskPatterns(String fname, String tname) throws Exception {
        File file = new File(fname);
        File fileOrig = new File(fname + ".orig");
        FileUtils.copyFile(file, fileOrig);

        // try-with-resources: both streams are closed even when reading, masking or writing throws.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileOrig), "UTF-8"));
                BufferedWriter out = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(file), "UTF-8"))) {
            boolean lastWasMasked = false;
            String line;

            while (null != (line = in.readLine())) {
                String processed = processLine(line, tname).line;
                boolean masked = processed.equals(MASK_PATTERN);

                // We're folding multiple masked lines into one.
                if (!(masked && lastWasMasked)) {
                    out.write(processed);
                    out.write("\n");
                }
                lastWasMasked = masked;
            }
        }
    }

    /**
     * Applies the masks to a single line.
     *
     * <ol>
     * <li>Partial masks. On {@code FsType.encrypted_hdfs} the first matching reserved pattern
     * turns the line into {@link #PARTIAL_MASK_PATTERN} followed by a space and the matched
     * text. On any other file system every matching partial pattern is replaced by its
     * replacement text.</li>
     * <li>If no partial mask matched: the first matching pattern-with-mask-comment is applied;
     * failing that, the stats, data-size and lineage masks are tried in that order, each only
     * if {@code tname} was registered for it by {@link #initMasks(File, String)}.</li>
     * <li>Still only if no partial mask matched in step 1: every plan-mask pattern is applied,
     * replacing its matches with {@link #MASK_PATTERN}. This runs whether or not step 2
     * changed the line.</li>
     * </ol>
     *
     * @param line the raw line
     * @param tname name checked against the sets filled by {@link #initMasks(File, String)}
     * @return the result holding the masked line
     */
    public LineProcessingResult processLine(String line, String tname) {
        LineProcessingResult result = new LineProcessingResult(line);

        if (fsType == FsType.encrypted_hdfs) {
            for (Pattern pattern : PARTIAL_RESERVED_PLAN_MASK) {
                Matcher matcher = pattern.matcher(result.line);
                if (matcher.find()) {
                    result.line = PARTIAL_MASK_PATTERN + " " + matcher.group(0);
                    result.partialMaskWasMatched = true;
                    break;
                }
            }
        } else {
            // No break: every partial pattern gets a chance on the (possibly already rewritten) line.
            for (PatternReplacementPair prp : PARTIAL_PLAN_MASK) {
                replaceIfFound(result, prp);
            }
        }

        if (result.partialMaskWasMatched) {
            return result;
        }

        for (Pair<Pattern, String> pair : patternsWithMaskComments) {
            Matcher matcher = pair.getLeft().matcher(result.line);
            if (matcher.find()) {
                result.line = matcher.replaceAll(pair.getRight());
                result.partialMaskWasMatched = true;
                break;
            }
        }

        if (!result.partialMaskWasMatched && qMaskStatsQuerySet.contains(tname)) {
            replaceIfFound(result, MASK_STATS);
        }
        if (!result.partialMaskWasMatched && qMaskDataSizeQuerySet.contains(tname)) {
            replaceIfFound(result, MASK_DATA_SIZE);
        }
        if (!result.partialMaskWasMatched && qMaskLineageQuerySet.contains(tname)) {
            replaceIfFound(result, MASK_LINEAGE);
        }

        for (Pattern pattern : PLAN_MASK) {
            result.line = pattern.matcher(result.line).replaceAll(MASK_PATTERN);
        }

        return result;
    }

    /**
     * Replaces every match of {@code prp.pattern} in the result's line with
     * {@code prp.replacement} and flags the result, if the pattern occurs at all. Uses the
     * already compiled pattern instead of recompiling it through {@code String.replaceAll}.
     */
    private static void replaceIfFound(LineProcessingResult result, PatternReplacementPair prp) {
        Matcher matcher = prp.pattern.matcher(result.line);
        if (matcher.find()) {
            // Matcher.replaceAll resets the matcher first, so the earlier find() does not skip anything.
            result.line = matcher.replaceAll(prp.replacement);
            result.partialMaskWasMatched = true;
        }
    }

    /**
     * Registers an additional pattern. It is appended after the built-in ones, and
     * {@link #processLine(String, String)} applies only the first pattern in the list that matches.
     *
     * @param patternStr regular expression to look for
     * @param maskComment replacement text (may contain group references such as {@code $1})
     */
    public void addPatternWithMaskComment(String patternStr, String maskComment) {
        patternsWithMaskComments.add(toPatternPair(patternStr, maskComment));
    }

    /**
     * Enables the optional masks for a query file: for each of the directives
     * {@code -- MASK_STATS}, {@code -- MASK_DATA_SIZE} and {@code -- MASK_LINEAGE} found in
     * {@code query}, {@code qf.getName()} is added to the corresponding set.
     *
     * @param qf the query file; only its name is used
     * @param query the query text to scan for directives
     */
    public void initMasks(File qf, String query) {
        if (matches(PATTERN_MASK_STATS, query)) {
            qMaskStatsQuerySet.add(qf.getName());
        }
        if (matches(PATTERN_MASK_DATA_SIZE, query)) {
            qMaskDataSizeQuerySet.add(qf.getName());
        }
        if (matches(PATTERN_MASK_LINEAGE, query)) {
            qMaskLineageQuerySet.add(qf.getName());
        }
    }

    /** Returns whether {@code pattern} occurs anywhere in {@code query}. */
    private static boolean matches(Pattern pattern, String query) {
        return pattern.matcher(query).find();
    }

}