Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.orc.impl.mask; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.io.Text; import org.apache.orc.DataMask; import org.apache.orc.TypeDescription; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Calendar; import java.util.Map; import java.util.SortedMap; import java.util.TimeZone; import java.util.TreeMap; import java.util.concurrent.TimeUnit; /** * Masking strategy that hides most string and numeric values based on unicode * character categories. * * Masking Parameters: * character replacements: string of 10 characters one per group below * letter, upper case (default X) * letter, lower case (default x) * number, digit (default 9) * symbol (default $) * punctuation (default .) * separator (default no masking) * letter, other (default ) * mark (default ) * number, other (default ) * other (default ?) * * time replacements: string of 6 numbers or _ one per field below * year (0 to 4000, default no masking) * month (1 to 12, default 1) * date (1 to 31, default 1) * hour (0 to 23, default 0) * minute (0 to 59, default 0) * second (0 to 59, default 0) * * Parameters use "_" for preserve original. */ public class RedactMaskFactory extends MaskFactory { /** * The value to indicate that the value should be preserved. */ private static final int UNMASKED_CHAR = "_".codePointAt(0); private static final int UNMASKED_DATE = -1; // The default replacements for each character category. // I picked a character in the same category so that the masking is // idempotent. For non-ascii characters, I mostly picked the first example. private static final int DEFAULT_LETTER_UPPER = "X".codePointAt(0); private static final int DEFAULT_LETTER_LOWER = "x".codePointAt(0); private static final int DEFAULT_NUMBER_DIGIT = 9; private static final int DEFAULT_NUMBER_DIGIT_CP = Integer.toString(DEFAULT_NUMBER_DIGIT).codePointAt(0); private static final int DEFAULT_SYMBOL = "$".codePointAt(0); private static final int DEFAULT_PUNCTUATION = ".".codePointAt(0); private static final int DEFAULT_SEPARATOR = UNMASKED_CHAR; private static final int DEFAULT_LETTER_OTHER = "\u00AA".codePointAt(0); private static final int DEFAULT_MARK = "\u0903".codePointAt(0); private static final int DEFAULT_NUMBER_OTHER = "\u00B2".codePointAt(0); private static final int DEFAULT_OTHER = "\u06DD".codePointAt(0); // The replacement codepoint for each character category. We use codepoints // here so that we don't have to worry about handling long UTF characters // as special cases. private final int UPPPER_REPLACEMENT; private final int LOWER_REPLACEMENT; private final int OTHER_LETTER_REPLACEMENT; private final int MARK_REPLACEMENT; private final int DIGIT_CP_REPLACEMENT; private final int OTHER_NUMBER_REPLACEMENT; private final int SYMBOL_REPLACEMENT; private final int PUNCTUATION_REPLACEMENT; private final int SEPARATOR_REPLACEMENT; private final int OTHER_REPLACEMENT; // numeric replacement private final int DIGIT_REPLACEMENT; // time replacement private final int YEAR_REPLACEMENT; private final int MONTH_REPLACEMENT; private final int DATE_REPLACEMENT; private final int HOUR_REPLACEMENT; private final int MINUTE_REPLACEMENT; private final int SECOND_REPLACEMENT; private final boolean maskDate; private final boolean maskTimestamp; // index tuples that are not to be masked private final SortedMap<Integer, Integer> unmaskIndexRanges = new TreeMap<>(); public RedactMaskFactory(String... params) { ByteBuffer param = params.length < 1 ? ByteBuffer.allocate(0) : ByteBuffer.wrap(params[0].getBytes(StandardCharsets.UTF_8)); UPPPER_REPLACEMENT = getNextCodepoint(param, DEFAULT_LETTER_UPPER); LOWER_REPLACEMENT = getNextCodepoint(param, DEFAULT_LETTER_LOWER); DIGIT_CP_REPLACEMENT = getNextCodepoint(param, DEFAULT_NUMBER_DIGIT_CP); DIGIT_REPLACEMENT = getReplacementDigit(DIGIT_CP_REPLACEMENT); SYMBOL_REPLACEMENT = getNextCodepoint(param, DEFAULT_SYMBOL); PUNCTUATION_REPLACEMENT = getNextCodepoint(param, DEFAULT_PUNCTUATION); SEPARATOR_REPLACEMENT = getNextCodepoint(param, DEFAULT_SEPARATOR); OTHER_LETTER_REPLACEMENT = getNextCodepoint(param, DEFAULT_LETTER_OTHER); MARK_REPLACEMENT = getNextCodepoint(param, DEFAULT_MARK); OTHER_NUMBER_REPLACEMENT = getNextCodepoint(param, DEFAULT_NUMBER_OTHER); OTHER_REPLACEMENT = getNextCodepoint(param, DEFAULT_OTHER); String[] timeParams; if (params.length < 2 || StringUtils.isBlank(params[1])) { timeParams = null; } else { timeParams = params[1].split("\\W+"); } YEAR_REPLACEMENT = getDateParam(timeParams, 0, UNMASKED_DATE, 4000); MONTH_REPLACEMENT = getDateParam(timeParams, 1, 1, 12); DATE_REPLACEMENT = getDateParam(timeParams, 2, 1, 31); HOUR_REPLACEMENT = getDateParam(timeParams, 3, 0, 23); MINUTE_REPLACEMENT = getDateParam(timeParams, 4, 0, 59); SECOND_REPLACEMENT = getDateParam(timeParams, 5, 0, 59); maskDate = (YEAR_REPLACEMENT != UNMASKED_DATE) || (MONTH_REPLACEMENT != UNMASKED_DATE) || (DATE_REPLACEMENT != UNMASKED_DATE); maskTimestamp = maskDate || (HOUR_REPLACEMENT != UNMASKED_DATE) || (MINUTE_REPLACEMENT != UNMASKED_DATE) || (SECOND_REPLACEMENT != UNMASKED_DATE); /* un-mask range */ if (!(params.length < 3 || StringUtils.isBlank(params[2]))) { String[] unmaskIndexes = params[2].split(","); for (int i = 0; i < unmaskIndexes.length; i++) { String[] pair = unmaskIndexes[i].trim().split(":"); unmaskIndexRanges.put(Integer.parseInt(pair[0]), Integer.parseInt(pair[1])); } } } @Override protected DataMask buildBooleanMask(TypeDescription schema) { if (DIGIT_CP_REPLACEMENT == UNMASKED_CHAR) { return new LongIdentity(); } else { return new BooleanRedactConverter(); } } @Override protected DataMask buildLongMask(TypeDescription schema) { if (DIGIT_CP_REPLACEMENT == UNMASKED_CHAR) { return new LongIdentity(); } else { return new LongRedactConverter(schema.getCategory()); } } @Override protected DataMask buildDecimalMask(TypeDescription schema) { if (DIGIT_CP_REPLACEMENT == UNMASKED_CHAR) { return new DecimalIdentity(); } else { return new DecimalRedactConverter(); } } @Override protected DataMask buildDoubleMask(TypeDescription schema) { if (DIGIT_CP_REPLACEMENT == UNMASKED_CHAR) { return new DoubleIdentity(); } else { return new DoubleRedactConverter(); } } @Override protected DataMask buildStringMask(TypeDescription schema) { return new StringConverter(); } @Override protected DataMask buildDateMask(TypeDescription schema) { if (maskDate) { return new DateRedactConverter(); } else { return new LongIdentity(); } } @Override protected DataMask buildTimestampMask(TypeDescription schema) { if (maskTimestamp) { return new TimestampRedactConverter(); } else { return new TimestampIdentity(); } } @Override protected DataMask buildBinaryMask(TypeDescription schema) { return new NullifyMask(); } class LongRedactConverter implements DataMask { final long mask; LongRedactConverter(TypeDescription.Category category) { switch (category) { case BYTE: mask = 0xff; break; case SHORT: mask = 0xffff; break; case INT: mask = 0xffff_ffff; break; default: case LONG: mask = -1; break; } } @Override public void maskData(ColumnVector original, ColumnVector masked, int start, int length) { LongColumnVector target = (LongColumnVector) masked; LongColumnVector source = (LongColumnVector) original; target.noNulls = original.noNulls; target.isRepeating = original.isRepeating; if (original.isRepeating) { target.vector[0] = maskLong(source.vector[0]) & mask; target.isNull[0] = source.isNull[0]; } else { for (int r = start; r < start + length; ++r) { target.vector[r] = maskLong(source.vector[r]) & mask; target.isNull[r] = source.isNull[r]; } } } } class BooleanRedactConverter implements DataMask { @Override public void maskData(ColumnVector original, ColumnVector masked, int start, int length) { LongColumnVector target = (LongColumnVector) masked; LongColumnVector source = (LongColumnVector) original; target.noNulls = original.noNulls; target.isRepeating = original.isRepeating; if (original.isRepeating) { target.vector[0] = DIGIT_REPLACEMENT == 0 ? 0 : 1; target.isNull[0] = source.isNull[0]; } else { for (int r = start; r < start + length; ++r) { target.vector[r] = DIGIT_REPLACEMENT == 0 ? 0 : 1; target.isNull[r] = source.isNull[r]; } } } } class DoubleRedactConverter implements DataMask { @Override public void maskData(ColumnVector original, ColumnVector masked, int start, int length) { DoubleColumnVector target = (DoubleColumnVector) masked; DoubleColumnVector source = (DoubleColumnVector) original; target.noNulls = original.noNulls; target.isRepeating = original.isRepeating; if (original.isRepeating) { target.vector[0] = maskDouble(source.vector[0]); target.isNull[0] = source.isNull[0]; } else { for (int r = start; r < start + length; ++r) { target.vector[r] = maskDouble(source.vector[r]); target.isNull[r] = source.isNull[r]; } } } } class StringConverter implements DataMask { @Override public void maskData(ColumnVector original, ColumnVector masked, int start, int length) { BytesColumnVector target = (BytesColumnVector) masked; BytesColumnVector source = (BytesColumnVector) original; target.noNulls = original.noNulls; target.isRepeating = original.isRepeating; if (original.isRepeating) { target.isNull[0] = source.isNull[0]; if (target.noNulls || !target.isNull[0]) { maskString(source, 0, target); } } else { for (int r = start; r < start + length; ++r) { target.isNull[r] = source.isNull[r]; if (target.noNulls || !target.isNull[r]) { maskString(source, r, target); } } } } } class DecimalRedactConverter implements DataMask { @Override public void maskData(ColumnVector original, ColumnVector masked, int start, int length) { DecimalColumnVector target = (DecimalColumnVector) masked; DecimalColumnVector source = (DecimalColumnVector) original; target.noNulls = original.noNulls; target.isRepeating = original.isRepeating; target.scale = source.scale; target.precision = source.precision; if (original.isRepeating) { target.isNull[0] = source.isNull[0]; if (target.noNulls || !target.isNull[0]) { target.vector[0].set(maskDecimal(source.vector[0])); } } else { for (int r = start; r < start + length; ++r) { target.isNull[r] = source.isNull[r]; if (target.noNulls || !target.isNull[r]) { target.vector[r].set(maskDecimal(source.vector[r])); } } } } } class TimestampRedactConverter implements DataMask { @Override public void maskData(ColumnVector original, ColumnVector masked, int start, int length) { TimestampColumnVector target = (TimestampColumnVector) masked; TimestampColumnVector source = (TimestampColumnVector) original; target.noNulls = original.noNulls; target.isRepeating = original.isRepeating; if (original.isRepeating) { target.isNull[0] = source.isNull[0]; if (target.noNulls || !target.isNull[0]) { target.time[0] = maskTime(source.time[0]); target.nanos[0] = 0; } } else { for (int r = start; r < start + length; ++r) { target.isNull[r] = source.isNull[r]; if (target.noNulls || !target.isNull[r]) { target.time[r] = maskTime(source.time[r]); target.nanos[r] = 0; } } } } } class DateRedactConverter implements DataMask { @Override public void maskData(ColumnVector original, ColumnVector masked, int start, int length) { LongColumnVector target = (LongColumnVector) masked; LongColumnVector source = (LongColumnVector) original; target.noNulls = original.noNulls; target.isRepeating = original.isRepeating; if (original.isRepeating) { target.isNull[0] = source.isNull[0]; if (target.noNulls || !target.isNull[0]) { target.vector[0] = maskDate((int) source.vector[0]); } } else { for (int r = start; r < start + length; ++r) { target.isNull[r] = source.isNull[r]; if (target.noNulls || !target.isNull[r]) { target.vector[r] = maskDate((int) source.vector[r]); } } } } } /** * Get the next code point from the ByteBuffer. Moves the position in the * ByteBuffer forward to the next code point. * @param param the source of bytes * @param defaultValue if there are no bytes left, use this value * @return the code point that was found at the front of the buffer. */ static int getNextCodepoint(ByteBuffer param, int defaultValue) { if (param.remaining() == 0) { return defaultValue; } else { return Text.bytesToCodePoint(param); } } /** * Get the replacement digit. This routine supports non-ASCII values for the * replacement. For example, if the user gives one of "7", "", "" or "?" * the value is 7. * @param digitCodePoint the code point that is replacing digits * @return the number from 0 to 9 to use as the numeric replacement */ static int getReplacementDigit(int digitCodePoint) { int dig = Character.getNumericValue(digitCodePoint); if (dig >= 0 && dig <= 9) { return dig; } else { return DEFAULT_NUMBER_DIGIT; } } static int getDateParam(String[] dateParams, int posn, int myDefault, int max) { if (dateParams != null && posn < dateParams.length) { if (dateParams[posn].codePointAt(0) == UNMASKED_CHAR) { return UNMASKED_DATE; } else { int result = Integer.parseInt(dateParams[posn]); if (result >= -1 && result <= max) { return result; } else { throw new IllegalArgumentException( "Invalid date parameter " + posn + " of " + dateParams[posn] + " greater than " + max); } } } else { return myDefault; } } /** * Replace each digit in value with DIGIT_REPLACEMENT scaled to the matching * number of digits. * @param value the number to mask * @return the masked value */ public long maskLong(long value) { /* check whether unmasking range provided */ if (!unmaskIndexRanges.isEmpty()) { return maskLongWithUnmasking(value); } long base; if (DIGIT_REPLACEMENT == 0) { return 0; } else if (value >= 0) { base = 1; } else { base = -1; // make sure Long.MIN_VALUE doesn't overflow if (value == Long.MIN_VALUE) { value = Long.MAX_VALUE; } else { value = -value; } } if (value < 100_000_000L) { if (value < 10_000L) { if (value < 100L) { if (value < 10L) { base *= 1; } else { base *= 11; } } else if (value < 1_000L) { base *= 111; } else { base *= 1_111; } } else if (value < 1_000_000L) { if (value < 100_000L) { base *= 11_111; } else { base *= 111_111; } } else if (value < 10_000_000L) { base *= 1_111_111; } else { base *= 11_111_111; } } else if (value < 10_000_000_000_000_000L) { if (value < 1_000_000_000_000L) { if (value < 10_000_000_000L) { if (value < 1_000_000_000L) { base *= 111_111_111; } else { base *= 1_111_111_111; } } else if (value < 100_000_000_000L) { base *= 11_111_111_111L; } else { base *= 111_111_111_111L; } } else if (value < 100_000_000_000_000L) { if (value < 10_000_000_000_000L) { base *= 1_111_111_111_111L; } else { base *= 11_111_111_111_111L; } } else if (value < 1_000_000_000_000_000L) { base *= 111_111_111_111_111L; } else { base *= 1_111_111_111_111_111L; } } else if (value < 100_000_000_000_000_000L) { base *= 11_111_111_111_111_111L; // If the digit is 9, it would overflow at 19 digits, so use 18. } else if (value < 1_000_000_000_000_000_000L || DIGIT_REPLACEMENT == 9) { base *= 111_111_111_111_111_111L; } else { base *= 1_111_111_111_111_111_111L; } return DIGIT_REPLACEMENT * base; } private static final double[] DOUBLE_POWER_10 = new double[] { 1e-308, 1e-307, 1e-306, 1e-305, 1e-304, 1e-303, 1e-302, 1e-301, 1e-300, 1e-299, 1e-298, 1e-297, 1e-296, 1e-295, 1e-294, 1e-293, 1e-292, 1e-291, 1e-290, 1e-289, 1e-288, 1e-287, 1e-286, 1e-285, 1e-284, 1e-283, 1e-282, 1e-281, 1e-280, 1e-279, 1e-278, 1e-277, 1e-276, 1e-275, 1e-274, 1e-273, 1e-272, 1e-271, 1e-270, 1e-269, 1e-268, 1e-267, 1e-266, 1e-265, 1e-264, 1e-263, 1e-262, 1e-261, 1e-260, 1e-259, 1e-258, 1e-257, 1e-256, 1e-255, 1e-254, 1e-253, 1e-252, 1e-251, 1e-250, 1e-249, 1e-248, 1e-247, 1e-246, 1e-245, 1e-244, 1e-243, 1e-242, 1e-241, 1e-240, 1e-239, 1e-238, 1e-237, 1e-236, 1e-235, 1e-234, 1e-233, 1e-232, 1e-231, 1e-230, 1e-229, 1e-228, 1e-227, 1e-226, 1e-225, 1e-224, 1e-223, 1e-222, 1e-221, 1e-220, 1e-219, 1e-218, 1e-217, 1e-216, 1e-215, 1e-214, 1e-213, 1e-212, 1e-211, 1e-210, 1e-209, 1e-208, 1e-207, 1e-206, 1e-205, 1e-204, 1e-203, 1e-202, 1e-201, 1e-200, 1e-199, 1e-198, 1e-197, 1e-196, 1e-195, 1e-194, 1e-193, 1e-192, 1e-191, 1e-190, 1e-189, 1e-188, 1e-187, 1e-186, 1e-185, 1e-184, 1e-183, 1e-182, 1e-181, 1e-180, 1e-179, 1e-178, 1e-177, 1e-176, 1e-175, 1e-174, 1e-173, 1e-172, 1e-171, 1e-170, 1e-169, 1e-168, 1e-167, 1e-166, 1e-165, 1e-164, 1e-163, 1e-162, 1e-161, 1e-160, 1e-159, 1e-158, 1e-157, 1e-156, 1e-155, 1e-154, 1e-153, 1e-152, 1e-151, 1e-150, 1e-149, 1e-148, 1e-147, 1e-146, 1e-145, 1e-144, 1e-143, 1e-142, 1e-141, 1e-140, 1e-139, 1e-138, 1e-137, 1e-136, 1e-135, 1e-134, 1e-133, 1e-132, 1e-131, 1e-130, 1e-129, 1e-128, 1e-127, 1e-126, 1e-125, 1e-124, 1e-123, 1e-122, 1e-121, 1e-120, 1e-119, 1e-118, 1e-117, 1e-116, 1e-115, 1e-114, 1e-113, 1e-112, 1e-111, 1e-110, 1e-109, 1e-108, 1e-107, 1e-106, 1e-105, 1e-104, 1e-103, 1e-102, 1e-101, 1e-100, 1e-99, 1e-98, 1e-97, 1e-96, 1e-95, 1e-94, 1e-93, 1e-92, 1e-91, 1e-90, 1e-89, 1e-88, 1e-87, 1e-86, 1e-85, 1e-84, 1e-83, 1e-82, 1e-81, 1e-80, 1e-79, 1e-78, 1e-77, 1e-76, 1e-75, 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66, 1e-65, 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57, 1e-56, 1e-55, 1e-54, 1e-53, 1e-52, 1e-51, 1e-50, 1e-49, 1e-48, 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41, 1e-40, 1e-39, 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29, 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307 }; /** * Replace each digit in value with digit. * @param value the number to mask * @return the */ public double maskDouble(double value) { /* check whether unmasking range provided */ if (!unmaskIndexRanges.isEmpty()) { return maskDoubleWIthUnmasking(value); } double base; // It seems better to mask 0 to 9.99999 rather than 9.99999e-308. if (value == 0 || DIGIT_REPLACEMENT == 0) { return DIGIT_REPLACEMENT * 1.11111; } else if (value > 0) { base = 1.11111; } else { base = -1.11111; value = -value; } int posn = Arrays.binarySearch(DOUBLE_POWER_10, value); if (posn < -DOUBLE_POWER_10.length - 2) { posn = DOUBLE_POWER_10.length - 1; } else if (posn == -1) { posn = 0; } else if (posn < 0) { posn = -posn - 2; } return DIGIT_REPLACEMENT * base * DOUBLE_POWER_10[posn]; } private final Calendar scratch = Calendar.getInstance(); /** * Given the requested masking parameters, redact the given time * @param millis the original time * @return the millis after it has been masked */ long maskTime(long millis) { scratch.setTimeInMillis(millis); if (YEAR_REPLACEMENT != UNMASKED_DATE) { scratch.set(Calendar.YEAR, YEAR_REPLACEMENT); } if (MONTH_REPLACEMENT != UNMASKED_DATE) { scratch.set(Calendar.MONTH, MONTH_REPLACEMENT - 1); } if (DATE_REPLACEMENT != UNMASKED_DATE) { scratch.set(Calendar.DATE, DATE_REPLACEMENT); } if (HOUR_REPLACEMENT != UNMASKED_DATE) { if (HOUR_REPLACEMENT >= 12) { scratch.set(Calendar.HOUR, HOUR_REPLACEMENT - 12); scratch.set(Calendar.AM_PM, Calendar.PM); } else { scratch.set(Calendar.HOUR, HOUR_REPLACEMENT); scratch.set(Calendar.AM_PM, Calendar.AM); } } if (MINUTE_REPLACEMENT != UNMASKED_DATE) { scratch.set(Calendar.MINUTE, MINUTE_REPLACEMENT); } if (SECOND_REPLACEMENT != UNMASKED_DATE) { scratch.set(Calendar.SECOND, SECOND_REPLACEMENT); scratch.set(Calendar.MILLISECOND, 0); } return scratch.getTimeInMillis(); } private static final long MILLIS_PER_DAY = TimeUnit.DAYS.toMillis(1); private final Calendar utcScratch = Calendar.getInstance(TimeZone.getTimeZone("UTC")); /** * Given a date as the number of days since epoch (1 Jan 1970), * mask the date given the parameters. * @param daysSinceEpoch the number of days after epoch * @return the number of days after epoch when masked */ int maskDate(int daysSinceEpoch) { utcScratch.setTimeInMillis(daysSinceEpoch * MILLIS_PER_DAY); if (YEAR_REPLACEMENT != UNMASKED_DATE) { utcScratch.set(Calendar.YEAR, YEAR_REPLACEMENT); } if (MONTH_REPLACEMENT != UNMASKED_DATE) { utcScratch.set(Calendar.MONTH, MONTH_REPLACEMENT - 1); } if (DATE_REPLACEMENT != UNMASKED_DATE) { utcScratch.set(Calendar.DATE, DATE_REPLACEMENT); } return (int) (utcScratch.getTimeInMillis() / MILLIS_PER_DAY); } /** * Mask a decimal. * This is painfully slow because it converts to a string and then back to * a decimal. Until HiveDecimalWritable gives us more access, this is * the best tradeoff between developer time, functionality, and run time. * @param source the value to mask * @return the masked value. */ HiveDecimalWritable maskDecimal(HiveDecimalWritable source) { return new HiveDecimalWritable(maskNumericString(source.toString())); } /** * Given a UTF code point, find the replacement codepoint * @param codepoint a UTF character * @return the replacement codepoint */ int getReplacement(int codepoint) { switch (Character.getType(codepoint)) { case Character.UPPERCASE_LETTER: return UPPPER_REPLACEMENT; case Character.LOWERCASE_LETTER: return LOWER_REPLACEMENT; case Character.TITLECASE_LETTER: case Character.MODIFIER_LETTER: case Character.OTHER_LETTER: return OTHER_LETTER_REPLACEMENT; case Character.NON_SPACING_MARK: case Character.ENCLOSING_MARK: case Character.COMBINING_SPACING_MARK: return MARK_REPLACEMENT; case Character.DECIMAL_DIGIT_NUMBER: return DIGIT_CP_REPLACEMENT; case Character.LETTER_NUMBER: case Character.OTHER_NUMBER: return OTHER_NUMBER_REPLACEMENT; case Character.SPACE_SEPARATOR: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: return SEPARATOR_REPLACEMENT; case Character.MATH_SYMBOL: case Character.CURRENCY_SYMBOL: case Character.MODIFIER_SYMBOL: case Character.OTHER_SYMBOL: return SYMBOL_REPLACEMENT; case Character.DASH_PUNCTUATION: case Character.START_PUNCTUATION: case Character.END_PUNCTUATION: case Character.CONNECTOR_PUNCTUATION: case Character.OTHER_PUNCTUATION: return PUNCTUATION_REPLACEMENT; default: return OTHER_REPLACEMENT; } } /** * Get the number of bytes for each codepoint * @param codepoint the codepoint to check * @return the number of bytes */ static int getCodepointLength(int codepoint) { if (codepoint < 0) { throw new IllegalArgumentException("Illegal codepoint " + codepoint); } else if (codepoint < 0x80) { return 1; } else if (codepoint < 0x7ff) { return 2; } else if (codepoint < 0xffff) { return 3; } else if (codepoint < 0x10FFFF) { return 4; } else { throw new IllegalArgumentException("Illegal codepoint " + codepoint); } } /** * Write the give codepoint to the buffer. * @param codepoint the codepoint to write * @param buffer the buffer to write into * @param offset the first offset to use * @param length the number of bytes that will be used */ static void writeCodepoint(int codepoint, byte[] buffer, int offset, int length) { switch (length) { case 1: buffer[offset] = (byte) codepoint; break; case 2: buffer[offset] = (byte) (0xC0 | codepoint >> 6); buffer[offset + 1] = (byte) (0x80 | (codepoint & 0x3f)); break; case 3: buffer[offset] = (byte) (0xE0 | codepoint >> 12); buffer[offset + 1] = (byte) (0x80 | ((codepoint >> 6) & 0x3f)); buffer[offset + 2] = (byte) (0x80 | (codepoint & 0x3f)); break; case 4: buffer[offset] = (byte) (0xF0 | codepoint >> 18); buffer[offset + 1] = (byte) (0x80 | ((codepoint >> 12) & 0x3f)); buffer[offset + 2] = (byte) (0x80 | ((codepoint >> 6) & 0x3f)); buffer[offset + 3] = (byte) (0x80 | (codepoint & 0x3f)); break; default: throw new IllegalArgumentException("Invalid length for codepoint " + codepoint + " = " + length); } } /** * Mask a string by finding the character category of each character * and replacing it with the matching literal. * @param source the source column vector * @param row the value index * @param target the target column vector */ void maskString(BytesColumnVector source, int row, BytesColumnVector target) { int expectedBytes = source.length[row]; ByteBuffer sourceBytes = ByteBuffer.wrap(source.vector[row], source.start[row], source.length[row]); // ensure we have enough space, if the masked data is the same size target.ensureValPreallocated(expectedBytes); byte[] outputBuffer = target.getValPreallocatedBytes(); int outputOffset = target.getValPreallocatedStart(); int outputStart = outputOffset; int index = 0; while (sourceBytes.remaining() > 0) { int cp = Text.bytesToCodePoint(sourceBytes); // Find the replacement for the current character. int replacement = getReplacement(cp); if (replacement == UNMASKED_CHAR || isIndexInUnmaskRange(index, source.length[row])) { replacement = cp; } // increment index index++; int len = getCodepointLength(replacement); // If the translation will overflow the buffer, we need to resize. // This will only happen when the masked size is larger than the original. if (len + outputOffset > outputBuffer.length) { // Revise estimate how much we are going to need now. We are maximally // pesamistic here so that we don't have to expand again for this value. int currentOutputStart = outputStart; int currentOutputLength = outputOffset - currentOutputStart; expectedBytes = currentOutputLength + len + sourceBytes.remaining() * 4; // Expand the buffer to fit the new estimate target.ensureValPreallocated(expectedBytes); // Copy over the bytes we've already written for this value and move // the pointers to the new output buffer. byte[] oldBuffer = outputBuffer; outputBuffer = target.getValPreallocatedBytes(); outputOffset = target.getValPreallocatedStart(); outputStart = outputOffset; System.arraycopy(oldBuffer, currentOutputStart, outputBuffer, outputOffset, currentOutputLength); outputOffset += currentOutputLength; } // finally copy the bytes writeCodepoint(replacement, outputBuffer, outputOffset, len); outputOffset += len; } target.setValPreallocated(row, outputOffset - outputStart); } static final long OVERFLOW_REPLACEMENT = 111_111_111_111_111_111L; /** * A function that masks longs when there are unmasked ranges. * @param value the original value * @return the masked value */ long maskLongWithUnmasking(long value) throws IndexOutOfBoundsException { try { return Long.parseLong(maskNumericString(Long.toString(value))); } catch (NumberFormatException nfe) { return OVERFLOW_REPLACEMENT * DIGIT_REPLACEMENT; } } /** * A function that masks doubles when there are unmasked ranges. * @param value original value * @return masked value */ double maskDoubleWIthUnmasking(final double value) { try { return Double.parseDouble(maskNumericString(Double.toString(value))); } catch (NumberFormatException nfe) { return OVERFLOW_REPLACEMENT * DIGIT_REPLACEMENT; } } /** * Mask the given stringified numeric value excluding the unmask range. * Non-digit characters are passed through on the assumption they are * markers (eg. one of ",.ef"). * @param value the original value. */ String maskNumericString(final String value) { StringBuilder result = new StringBuilder(); final int length = value.codePointCount(0, value.length()); for (int c = 0; c < length; ++c) { int cp = value.codePointAt(c); if (isIndexInUnmaskRange(c, length) || Character.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) { result.appendCodePoint(cp); } else { result.appendCodePoint(DIGIT_CP_REPLACEMENT); } } return result.toString(); } /** * Given an index and length of a string * find out whether it is in a given un-mask range. * @param index the character point index * @param length the length of the string in character points * @return true if the index is in un-mask range else false. */ private boolean isIndexInUnmaskRange(final int index, final int length) { for (final Map.Entry<Integer, Integer> pair : unmaskIndexRanges.entrySet()) { int start; int end; if (pair.getKey() >= 0) { // for positive indexes start = pair.getKey(); } else { // for negative indexes start = length + pair.getKey(); } if (pair.getValue() >= 0) { // for positive indexes end = pair.getValue(); } else { // for negative indexes end = length + pair.getValue(); } // if the given index is in range if (index >= start && index <= end) { return true; } } return false; } }