com.ebay.erl.mobius.core.criterion.TupleRestrictions.java Source code

Java tutorial

Introduction

Here is the source code for com.ebay.erl.mobius.core.criterion.TupleRestrictions.java

Source

package com.ebay.erl.mobius.core.criterion;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.Date;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.StringUtils;

import com.ebay.erl.mobius.core.builder.Dataset;
import com.ebay.erl.mobius.core.collection.CaseInsensitiveTreeSet;
import com.ebay.erl.mobius.core.model.Tuple;

/**
 * Factory class that provides methods to define {@link TupleCriterion}
 * for filtering {@link Tuple}s in a {@link Dataset}.
 * 
 * <p>
 * This product is licensed under the Apache License,  Version 2.0, 
 * available at http://www.apache.org/licenses/LICENSE-2.0.
 * 
 * This product contains portions derived from Apache hadoop which is 
 * licensed under the Apache License, Version 2.0, available at 
 * http://hadoop.apache.org.
 * 
 * © 2007 – 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
 */
public class TupleRestrictions {
    /**
     * Hadoop configuration shared by the static methods of this class.
     * <p>
     * It is assigned by {@link #configure(Configuration)} and is
     * <code>null</code> unless something has assigned it.  The file existence
     * check used by {@link #within(String, File)} reads the
     * <code>mobius.studio.workspace.base</code> property from it.
     */
    protected static Configuration conf;

    /**
     * Setup Hadoop configuration.
     * <p>
     * Stores the given configuration in the static {@link #conf} field,
     * replacing whatever was stored before.
     *
     * @param conf the Hadoop configuration to remember; the value is stored
     * as is, without any validation.
     */
    public static final void configure(Configuration conf) {
        TupleRestrictions.conf = conf;
    }

    /**
     * Resolves <code>file</code> to the location that should actually be
     * read and verifies that it exists.
     * <p>
     * When a configuration has been registered and it defines the
     * <code>mobius.studio.workspace.base</code> property, only the simple
     * name of <code>file</code> is kept and it is looked up inside that base
     * directory.  Otherwise <code>file</code> itself is checked.
     *
     * @param file the file requested by the caller.
     *
     * @return the file to use: either the one under the workspace base
     * directory or <code>file</code> itself.
     *
     * @throws FileNotFoundException if the resolved file does not exist.
     */
    private static File checkFileExist(File file) throws FileNotFoundException {
        final Configuration registered = TupleRestrictions.conf;
        final String workspaceBase = (registered == null) ? null
                : registered.get("mobius.studio.workspace.base");

        if (workspaceBase == null) {
            // no workspace configured: the file is used exactly as given
            if (file.exists()) {
                return file;
            }
            throw new FileNotFoundException("File not found:" + file.getAbsolutePath());
        }

        // workspace configured: look the file up by name inside the base directory
        final File resolved = new File(new File(workspaceBase), file.getName());
        if (resolved.exists()) {
            return resolved;
        }
        throw new FileNotFoundException("File not found in:" + resolved.getAbsolutePath());
    }

    /**
     * Builds a {@link TupleCriterion} that accepts a tuple only when the
     * value of <code>column</code> is one of the entries of
     * <code>list</code>.
     * <p>
     *
     * Values that are not strings are converted to string before being
     * compared.
     *
     * @param column the name of the column to test.
     *
     * @param list the accepted values.
     *
     * @return a {@link StringCriterion} using
     * {@link RelationalOperator#WITHIN}.
     */
    public static TupleCriterion withinString(final String column, final ArrayList<String> list) {
        final TupleCriterion criterion = new StringCriterion(column, list, RelationalOperator.WITHIN);
        return criterion;
    }

    /**
     * Builds a {@link TupleCriterion} that accepts a tuple only when the
     * value of <code>column</code> is one of the entries of
     * <code>list</code>.
     * <p>
     *
     * Values that are not numbers are converted to double before being
     * compared.
     *
     * @param column the name of the column to test.
     *
     * @param list the accepted values.
     *
     * @return a {@link NumberCriterion} using
     * {@link RelationalOperator#WITHIN}.
     */
    public static TupleCriterion withinNumber(final String column, final ArrayList<Double> list) {
        final TupleCriterion criterion = new NumberCriterion(column, list, RelationalOperator.WITHIN);
        return criterion;
    }

    /**
     * Create a tuple criterion that only accepts tuples when the value
     * of the <code>column</code> is present in the given <code>file</code>.
     * <p>
     *
     * The file is assumed to be a single-column text file with one or more
     * lines.  Each line is read into a case insensitive set, and the set is
     * used to check whether the value of the <code>column</code> is listed
     * in the file or not.
     * <p>
     *
     * When the returned criterion receives its configuration
     * (<code>setConf</code>), the fully qualified path of the file is
     * prepended to the <code>tmpfiles</code> property.  At evaluation time
     * the file is opened by its simple name, i.e. relative to the current
     * working directory (presumably where Hadoop makes the
     * <code>tmpfiles</code> entries available to a task).
     *
     * @param column the name of a column to be tested that whether its value is in
     * the given <code>file</code> or not
     *
     * @param file a single column and multiple lines of file that contains strings/numbers,
     * each line is treated as a single unit.
     *
     * @return an instance of {@link TupleCriterion} that extracts only the records
     * when the value of its <code>column</code> is present in the given
     * <code>file</code>.
     *
     * @throws FileNotFoundException if the given file cannot be found.
     */
    public static TupleCriterion within(final String column, File file) throws FileNotFoundException {
        final File f = TupleRestrictions.checkFileExist(file);

        return new TupleCriterion() {

            private static final long serialVersionUID = -1121221619118915652L;

            /**
             * Lines of the file, loaded on the first call to
             * <code>evaluate</code>; <code>null</code> until a load has
             * completed successfully.
             */
            private Set<String> set;

            /**
             * Registers the file in the <code>tmpfiles</code> property of
             * the given configuration, in front of any entries already there.
             *
             * @throws IllegalArgumentException if the file cannot be
             * validated against its file system.
             */
            @Override
            public void setConf(Configuration conf) {
                try {
                    String existing = conf.get("tmpfiles");
                    String qualified = validateFiles(f.getAbsolutePath(), conf);
                    if (existing == null || existing.trim().length() == 0) {
                        conf.set("tmpfiles", qualified);
                    } else {
                        conf.set("tmpfiles", qualified + "," + existing);
                    }
                } catch (IOException e) {
                    throw new IllegalArgumentException(e);
                }
            }

            /**
             * COPIED FROM org.apache.hadoop.util.GenericOptionsParser
             * <p>
             * Checks that every comma separated path in <code>files</code>
             * exists and returns the paths fully qualified against their
             * file system (the local one when a path has no scheme).
             *
             * @return the qualified paths joined by
             * {@link StringUtils#arrayToString(String[])}, or
             * <code>null</code> when <code>files</code> is <code>null</code>.
             *
             * @throws IOException if a path does not exist or its file
             * system cannot be accessed.
             */
            private String validateFiles(String files, Configuration conf) throws IOException {
                if (files == null)
                    return null;
                String[] fileArr = files.split(",");
                String[] finalArr = new String[fileArr.length];
                for (int i = 0; i < fileArr.length; i++) {
                    String tmp = fileArr[i];
                    String finalPath;
                    Path path = new Path(tmp);
                    URI pathURI = path.toUri();
                    FileSystem localFs = FileSystem.getLocal(conf);
                    if (pathURI.getScheme() == null) {
                        // default to the local file system
                        // check if the file exists or not first
                        if (!localFs.exists(path)) {
                            throw new FileNotFoundException("File " + tmp + " does not exist.");
                        }
                        finalPath = path.makeQualified(localFs).toString();
                    } else {
                        // check if the file exists in this file system
                        // we need to recreate this filesystem object to copy
                        // these files to the file system jobtracker is running
                        // on.
                        FileSystem fs = path.getFileSystem(conf);
                        if (!fs.exists(path)) {
                            throw new FileNotFoundException("File " + tmp + " does not exist.");
                        }
                        finalPath = path.makeQualified(fs).toString();
                        // NOTE(review): kept from the copied Hadoop code; confirm that
                        // closing this FileSystem instance is safe for other users of it.
                        try {
                            fs.close();
                        } catch (IOException ignored) {
                            // best effort: the path has already been validated
                        }
                    }
                    finalArr[i] = finalPath;
                }
                return StringUtils.arrayToString(finalArr);
            }

            /**
             * Accepts the tuple when the string value of the column is
             * contained in the set loaded from the file; a <code>null</code>
             * value is never accepted.
             */
            @Override
            protected boolean evaluate(Tuple tuple, Configuration configuration) {
                if (set == null) {
                    // assigned only after a complete read, so a failed load is
                    // retried instead of leaving a partially filled set behind
                    set = loadSet();
                }

                String value = tuple.getString(column);
                if (value != null) {
                    return this.set.contains(value);
                } else {
                    return false;
                }
            }

            /**
             * Reads every line of the file (opened by its simple name) into
             * a new case insensitive set.
             *
             * @throws RuntimeException wrapping the {@link IOException} when
             * the file cannot be opened or read.
             */
            private Set<String> loadSet() {
                Set<String> loaded = new CaseInsensitiveTreeSet();
                BufferedReader br = null;
                try {
                    br = new BufferedReader(new FileReader(new File(f.getName())));
                    String newLine = null;
                    while ((newLine = br.readLine()) != null) {
                        loaded.add(newLine);
                    }
                    return loaded;
                } catch (IOException e) {
                    throw new RuntimeException(e);
                } finally {
                    // the reader is null when opening the file failed
                    if (br != null) {
                        try {
                            br.close();
                        } catch (IOException ignored) {
                            // nothing useful can be done about a failed close
                        }
                    }
                }
            }

            @Override
            public String[] getInvolvedColumns() {
                return new String[] { column };
            }
        };
    }

    /**
     * Builds a tuple criterion that accepts a tuple only when the value of
     * <code>column</code> is <b>NOT</b> listed in the given
     * <code>file</code>.
     * <p>
     *
     * The result is the negation of {@link #within(String, File)}, so the
     * same assumption applies: the file is a single-column text file with
     * one or more lines, each line being read into a case insensitive set
     * against which the value of the <code>column</code> is checked.
     *
     * @param column the name of the column whose value is looked up in
     * the given <code>file</code>.
     *
     * @param file a single-column, multi-line file containing
     * strings/numbers; each line is treated as a single unit.
     *
     * @return an instance of {@link TupleCriterion} that extracts only the
     * records whose <code>column</code> value is <b>NOT</b> present in the
     * given <code>file</code>.
     *
     * @throws FileNotFoundException if the given file cannot be found.
     */
    public static TupleCriterion not_within(final String column, final File file) throws FileNotFoundException {
        return TupleRestrictions.within(column, file).not();
    }

    /**
     * Builds a {@link TupleCriterion} that accepts a tuple only when the
     * value of <code>column</code> is <b>not</b> one of the given
     * <code>values</code>.
     * <p>
     *
     * The result is the negation of {@link #withinNumber(String, ArrayList)};
     * values that are not double are converted to double before being
     * compared.
     *
     * @param column the name of the column to test.
     *
     * @param values the rejected values.
     */
    public static TupleCriterion notWithinNumber(final String column, final ArrayList<Double> values) {
        final TupleCriterion listed = TupleRestrictions.withinNumber(column, values);
        return listed.not();
    }

    /**
     * Builds a {@link TupleCriterion} that accepts a tuple only when the
     * value of <code>column</code> is <b>not</b> one of the given
     * <code>values</code>.
     * <p>
     *
     * The result is the negation of {@link #withinString(String, ArrayList)};
     * values that are not strings are converted to string before being
     * compared.
     *
     * @param column the name of the column to test.
     *
     * @param values the rejected values.
     */
    public static TupleCriterion notWithinString(final String column, final ArrayList<String> values) {
        final TupleCriterion listed = TupleRestrictions.withinString(column, values);
        return listed.not();
    }

    /**
     * Builds a {@link TupleCriterion} that keeps only the records whose
     * <code>column</code> value matches the given <code>regex</code>.
     * <p>
     *
     * The test is performed with {@link Matcher#find()}, so it succeeds as
     * soon as any part of the value matches the expression.  A
     * <code>null</code> value never matches.
     *
     * @param column the name of the column whose value is tested against
     * the specified <code>regex</code>.
     *
     * @param regex the regular expression to test with.
     *
     * @return a {@link TupleCriterion} accepting the values of
     * <code>column</code> that match the given <code>regex</code>.
     */
    public static TupleCriterion regex(final String column, final String regex) {
        return new TupleCriterion() {

            private static final long serialVersionUID = -6630104271777176036L;

            // Both fields are transient; evaluate() rebuilds them whenever
            // the pattern is found to be null.
            private transient Pattern pattern = Pattern.compile(regex);
            private transient Matcher matcher = pattern.matcher("");

            @Override
            protected boolean evaluate(Tuple tuple, Configuration configuration) {
                if (pattern == null) {
                    pattern = Pattern.compile(regex);
                    matcher = pattern.matcher("");
                }

                final String text = tuple.getString(column);
                if (text == null) {
                    return false;
                }
                // the single matcher is re-targeted at each value
                return matcher.reset(text).find();
            }

            @Override
            public String[] getInvolvedColumns() {
                final String[] involved = { column };
                return involved;
            }
        };
    }

    /**
     * Builds a {@link TupleCriterion} that accepts a tuple only when the
     * value of the given <code>column</code> is neither <code>null</code>
     * nor a string that is empty once trimmed.
     *
     * @param column the name of the column to test.
     */
    public static TupleCriterion notNull(final String column) {

        return new TupleCriterion() {

            private static final long serialVersionUID = 1573625916312469904L;

            @Override
            protected boolean evaluate(Tuple tuple, Configuration configuration) {
                if (tuple.get(column) == null) {
                    return false;
                }
                // whitespace-only values count as empty
                final String trimmed = tuple.getString(column).trim();
                return trimmed.length() > 0;
            }

            @Override
            public String[] getInvolvedColumns() {
                final String[] involved = { column };
                return involved;
            }
        };
    }

    /**
     * Builds a criterion requiring the value of <code>column</code> to be
     * equal to the string <code>value</code>.
     *
     * @return a {@link StringCriterion} using {@link RelationalOperator#EQ}.
     */
    public static TupleCriterion eq(String column, String value) {
        final TupleCriterion criterion = new StringCriterion(column, value, RelationalOperator.EQ);
        return criterion;
    }

    /**
     * Builds a criterion requiring the value of <code>column</code> to be
     * equal to the number <code>value</code>, which is compared as a
     * double.
     *
     * @return a {@link NumberCriterion} using {@link RelationalOperator#EQ}.
     */
    public static TupleCriterion eq(String column, Number value) {
        final double expected = value.doubleValue();
        return new NumberCriterion(column, expected, RelationalOperator.EQ);
    }

    /**
     * Builds a criterion requiring the boolean value of <code>column</code>
     * to be equal to <code>trueFalse</code>.
     */
    public static TupleCriterion eq(final String column, final boolean trueFalse) {
        return new TupleCriterion() {
            private static final long serialVersionUID = 3652448730224390852L;

            @Override
            protected boolean evaluate(Tuple tuple, Configuration configuration) {
                final boolean actual = tuple.getBoolean(column);
                return actual == trueFalse;
            }

            @Override
            public String[] getInvolvedColumns() {
                final String[] involved = { column };
                return involved;
            }

        };
    }

    /**
     * Returns a {@link TupleCriterion} that parses the value of
     * <code>column</code> with the given <code>columnDateFormat</code> into
     * milliseconds (A), compares it with the milliseconds of
     * <code>date</code> (B), and only accepts the records for which A
     * equals B.
     *
     * @param column name of a column to be tested in a dataset.
     *
     * @param columnDateFormat the date format of the specified
     * <code>column</code> in the dataset.  The pattern syntax is the same
     * as {@link java.text.SimpleDateFormat}.
     *
     * @param date the date constraint to test against.
     */
    public static TupleCriterion eq(String column, String columnDateFormat, java.util.Date date) {
        final long millis = date.getTime();
        return new DateCriterion(column, columnDateFormat, millis, RelationalOperator.EQ);
    }

    /**
     * Returns a {@link TupleCriterion} that only accepts tuples whose
     * <code>column</code> value is equal to the specified
     * <code>date</code>.
     * <p>
     *
     * If the value of the <code>column</code> is an instance of
     * {@link java.util.Date}, the comparison is done between its
     * {@link java.util.Date#getTime()} and <code>date.getTime()</code>.
     * <p>
     *
     * If the value is not an instance of {@link java.util.Date}, it is
     * parsed into a date using either the <code>yyyy-MM-dd</code> or the
     * <code>yyyy-MM-dd HH:mm:ss</code> format.
     *
     * @param column name of a column to be tested in a dataset.
     *
     * @param date the date constraint to test against.
     */
    public static TupleCriterion eq(String column, java.util.Date date) {
        final long millis = date.getTime();
        return new DateCriterion(column, null, millis, RelationalOperator.EQ);
    }

    /**
     * Builds a {@link TupleCriterion} that only accepts tuples in which the
     * values of <code>column1</code> and <code>column2</code> are equal.
     *
     * @return a {@link ColumnsCriterion} using {@link RelationalOperator#EQ}.
     */
    public static TupleCriterion eqColumns(final String column1, final String column2) {
        final TupleCriterion criterion = new ColumnsCriterion(column1, column2, RelationalOperator.EQ);
        return criterion;
    }

    /**
     * Builds a criterion requiring the value of <code>columnName</code> to
     * be not equal to the string <code>value</code>.
     *
     * @return a {@link StringCriterion} using {@link RelationalOperator#NE}.
     */
    public static TupleCriterion ne(String columnName, String value) {
        final TupleCriterion criterion = new StringCriterion(columnName, value, RelationalOperator.NE);
        return criterion;
    }

    /**
     * Builds a criterion requiring the value of <code>columnName</code> to
     * be not equal to the number <code>value</code>, which is compared as a
     * double.
     *
     * @return a {@link NumberCriterion} using {@link RelationalOperator#NE}.
     */
    public static TupleCriterion ne(String columnName, Number value) {
        final double operand = value.doubleValue();
        return new NumberCriterion(columnName, operand, RelationalOperator.NE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be not equal to <code>date</code>.
     *
     * @param columnName name of the column to test.
     *
     * @param columnFormat the date format handed to {@link DateCriterion}
     * for the column.
     *
     * @param date the date constraint; its {@link Date#getTime()}
     * milliseconds are what is compared.
     */
    public static TupleCriterion ne(String columnName, String columnFormat, Date date) {
        final long millis = date.getTime();
        return new DateCriterion(columnName, columnFormat, millis, RelationalOperator.NE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be not equal to <code>date</code>.
     * <p>
     * No column date format is supplied (<code>null</code> is handed to
     * {@link DateCriterion}).
     *
     * @param columnName name of the column to test.
     *
     * @param date the date constraint; its {@link Date#getTime()}
     * milliseconds are what is compared.
     */
    public static TupleCriterion ne(String columnName, Date date) {
        final long millis = date.getTime();
        return new DateCriterion(columnName, null, millis, RelationalOperator.NE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be not equal to the instant of the given calendar.
     *
     * @param columnName name of the column to test.
     *
     * @param columnFormat the date format handed to {@link DateCriterion}
     * for the column.
     *
     * @param date the calendar constraint; its
     * {@link Calendar#getTimeInMillis()} value is what is compared.
     */
    public static TupleCriterion ne(String columnName, String columnFormat, Calendar date) {
        final long millis = date.getTimeInMillis();
        return new DateCriterion(columnName, columnFormat, millis, RelationalOperator.NE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be not equal to the instant of the given calendar.
     * <p>
     * No column date format is supplied (<code>null</code> is handed to
     * {@link DateCriterion}).
     *
     * @param columnName name of the column to test.
     *
     * @param date the calendar constraint; its
     * {@link Calendar#getTimeInMillis()} value is what is compared.
     */
    public static TupleCriterion ne(String columnName, Calendar date) {
        final long millis = date.getTimeInMillis();
        return new DateCriterion(columnName, null, millis, RelationalOperator.NE);
    }

    /**
     * Builds a criterion requiring the boolean value of <code>column</code>
     * to differ from <code>trueFalse</code>.
     */
    public static TupleCriterion ne(final String column, final boolean trueFalse) {
        return new TupleCriterion() {
            private static final long serialVersionUID = 3652448730224390852L;

            @Override
            protected boolean evaluate(Tuple tuple, Configuration configuration) {
                final boolean actual = tuple.getBoolean(column);
                return actual != trueFalse;
            }

            @Override
            public String[] getInvolvedColumns() {
                final String[] involved = { column };
                return involved;
            }

        };
    }

    /**
     * Builds a {@link TupleCriterion} that only accepts tuples in which the
     * values of <code>column1</code> and <code>column2</code> are not equal.
     *
     * @return a {@link ColumnsCriterion} using {@link RelationalOperator#NE}.
     */
    public static TupleCriterion neColumns(final String column1, final String column2) {
        final TupleCriterion criterion = new ColumnsCriterion(column1, column2, RelationalOperator.NE);
        return criterion;
    }

    /**
     * Builds a criterion requiring the value of <code>columnName</code> to
     * be greater than the string <code>value</code>.
     *
     * @return a {@link StringCriterion} using {@link RelationalOperator#GT}.
     */
    public static TupleCriterion gt(String columnName, String value) {
        final TupleCriterion criterion = new StringCriterion(columnName, value, RelationalOperator.GT);
        return criterion;
    }

    /**
     * Builds a criterion requiring the value of <code>columnName</code> to
     * be greater than the number <code>value</code>, which is compared as a
     * double.
     *
     * @return a {@link NumberCriterion} using {@link RelationalOperator#GT}.
     */
    public static TupleCriterion gt(String columnName, Number value) {
        final double operand = value.doubleValue();
        return new NumberCriterion(columnName, operand, RelationalOperator.GT);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be greater than <code>date</code>.
     *
     * @param columnName name of the column to test.
     *
     * @param columnFormat the date format handed to {@link DateCriterion}
     * for the column.
     *
     * @param date the date constraint; its {@link Date#getTime()}
     * milliseconds are what is compared.
     */
    public static TupleCriterion gt(String columnName, String columnFormat, Date date) {
        final long millis = date.getTime();
        return new DateCriterion(columnName, columnFormat, millis, RelationalOperator.GT);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be greater than <code>date</code>.
     * <p>
     * No column date format is supplied (<code>null</code> is handed to
     * {@link DateCriterion}).
     *
     * @param columnName name of the column to test.
     *
     * @param date the date constraint; its {@link Date#getTime()}
     * milliseconds are what is compared.
     */
    public static TupleCriterion gt(String columnName, Date date) {
        final long millis = date.getTime();
        return new DateCriterion(columnName, null, millis, RelationalOperator.GT);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be greater than the instant of the given calendar.
     *
     * @param columnName name of the column to test.
     *
     * @param columnFormat the date format handed to {@link DateCriterion}
     * for the column.
     *
     * @param date the calendar constraint; its
     * {@link Calendar#getTimeInMillis()} value is what is compared.
     */
    public static TupleCriterion gt(String columnName, String columnFormat, Calendar date) {
        final long millis = date.getTimeInMillis();
        return new DateCriterion(columnName, columnFormat, millis, RelationalOperator.GT);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be greater than the instant of the given calendar.
     * <p>
     * No column date format is supplied (<code>null</code> is handed to
     * {@link DateCriterion}).
     *
     * @param columnName name of the column to test.
     *
     * @param date the calendar constraint; its
     * {@link Calendar#getTimeInMillis()} value is what is compared.
     */
    public static TupleCriterion gt(String columnName, Calendar date) {
        final long millis = date.getTimeInMillis();
        return new DateCriterion(columnName, null, millis, RelationalOperator.GT);
    }

    /**
     * Builds a {@link TupleCriterion} that only accepts tuples in which the
     * value of <code>column1</code> is greater than the value of
     * <code>column2</code>.
     *
     * @return a {@link ColumnsCriterion} using {@link RelationalOperator#GT}.
     */
    public static TupleCriterion gtColumns(final String column1, final String column2) {
        final TupleCriterion criterion = new ColumnsCriterion(column1, column2, RelationalOperator.GT);
        return criterion;
    }

    /**
     * Builds a criterion requiring the value of <code>columnName</code> to
     * be greater than or equal to the string <code>value</code>.
     *
     * @return a {@link StringCriterion} using {@link RelationalOperator#GE}.
     */
    public static TupleCriterion ge(String columnName, String value) {
        final TupleCriterion criterion = new StringCriterion(columnName, value, RelationalOperator.GE);
        return criterion;
    }

    /**
     * Builds a criterion requiring the value of <code>columnName</code> to
     * be greater than or equal to the number <code>value</code>, which is
     * compared as a double.
     *
     * @return a {@link NumberCriterion} using {@link RelationalOperator#GE}.
     */
    public static TupleCriterion ge(String columnName, Number value) {
        final double operand = value.doubleValue();
        return new NumberCriterion(columnName, operand, RelationalOperator.GE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be greater than or equal to <code>date</code>.
     *
     * @param columnName name of the column to test.
     *
     * @param columnFormat the date format handed to {@link DateCriterion}
     * for the column.
     *
     * @param date the date constraint; its {@link Date#getTime()}
     * milliseconds are what is compared.
     */
    public static TupleCriterion ge(String columnName, String columnFormat, Date date) {
        final long millis = date.getTime();
        return new DateCriterion(columnName, columnFormat, millis, RelationalOperator.GE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be greater than or equal to <code>date</code>.
     * <p>
     * No column date format is supplied (<code>null</code> is handed to
     * {@link DateCriterion}).
     *
     * @param columnName name of the column to test.
     *
     * @param date the date constraint; its {@link Date#getTime()}
     * milliseconds are what is compared.
     */
    public static TupleCriterion ge(String columnName, Date date) {
        final long millis = date.getTime();
        return new DateCriterion(columnName, null, millis, RelationalOperator.GE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be greater than or equal to the instant of the given calendar.
     *
     * @param columnName name of the column to test.
     *
     * @param columnFormat the date format handed to {@link DateCriterion}
     * for the column.
     *
     * @param date the calendar constraint; its
     * {@link Calendar#getTimeInMillis()} value is what is compared.
     */
    public static TupleCriterion ge(String columnName, String columnFormat, Calendar date) {
        final long millis = date.getTimeInMillis();
        return new DateCriterion(columnName, columnFormat, millis, RelationalOperator.GE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be greater than or equal to the instant of the given calendar.
     * <p>
     * No column date format is supplied (<code>null</code> is handed to
     * {@link DateCriterion}).
     *
     * @param columnName name of the column to test.
     *
     * @param date the calendar constraint; its
     * {@link Calendar#getTimeInMillis()} value is what is compared.
     */
    public static TupleCriterion ge(String columnName, Calendar date) {
        final long millis = date.getTimeInMillis();
        return new DateCriterion(columnName, null, millis, RelationalOperator.GE);
    }

    /**
     * Builds a {@link TupleCriterion} that only accepts tuples in which the
     * value of <code>column1</code> is greater than or equal to the value
     * of <code>column2</code>.
     *
     * @return a {@link ColumnsCriterion} using {@link RelationalOperator#GE}.
     */
    public static TupleCriterion geColumns(final String column1, final String column2) {
        final TupleCriterion criterion = new ColumnsCriterion(column1, column2, RelationalOperator.GE);
        return criterion;
    }

    /**
     * Builds a criterion requiring the value of <code>columnName</code> to
     * be less than or equal to the string <code>value</code>.
     *
     * @return a {@link StringCriterion} using {@link RelationalOperator#LE}.
     */
    public static TupleCriterion le(String columnName, String value) {
        final TupleCriterion criterion = new StringCriterion(columnName, value, RelationalOperator.LE);
        return criterion;
    }

    /**
     * Builds a criterion requiring the value of <code>columnName</code> to
     * be less than or equal to the number <code>value</code>, which is
     * compared as a double.
     *
     * @return a {@link NumberCriterion} using {@link RelationalOperator#LE}.
     */
    public static TupleCriterion le(String columnName, Number value) {
        final double operand = value.doubleValue();
        return new NumberCriterion(columnName, operand, RelationalOperator.LE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be less than or equal to <code>date</code>.
     *
     * @param columnName name of the column to test.
     *
     * @param columnFormat the date format handed to {@link DateCriterion}
     * for the column.
     *
     * @param date the date constraint; its {@link Date#getTime()}
     * milliseconds are what is compared.
     */
    public static TupleCriterion le(String columnName, String columnFormat, Date date) {
        final long millis = date.getTime();
        return new DateCriterion(columnName, columnFormat, millis, RelationalOperator.LE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be less than or equal to <code>date</code>.
     * <p>
     * No column date format is supplied (<code>null</code> is handed to
     * {@link DateCriterion}).
     *
     * @param columnName name of the column to test.
     *
     * @param date the date constraint; its {@link Date#getTime()}
     * milliseconds are what is compared.
     */
    public static TupleCriterion le(String columnName, Date date) {
        final long millis = date.getTime();
        return new DateCriterion(columnName, null, millis, RelationalOperator.LE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be less than or equal to the instant of the given calendar.
     *
     * @param columnName name of the column to test.
     *
     * @param columnFormat the date format handed to {@link DateCriterion}
     * for the column.
     *
     * @param date the calendar constraint; its
     * {@link Calendar#getTimeInMillis()} value is what is compared.
     */
    public static TupleCriterion le(String columnName, String columnFormat, Calendar date) {
        final long millis = date.getTimeInMillis();
        return new DateCriterion(columnName, columnFormat, millis, RelationalOperator.LE);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be less than or equal to the instant of the given calendar.
     * <p>
     * No column date format is supplied (<code>null</code> is handed to
     * {@link DateCriterion}).
     *
     * @param columnName name of the column to test.
     *
     * @param date the calendar constraint; its
     * {@link Calendar#getTimeInMillis()} value is what is compared.
     */
    public static TupleCriterion le(String columnName, Calendar date) {
        final long millis = date.getTimeInMillis();
        return new DateCriterion(columnName, null, millis, RelationalOperator.LE);
    }

    /**
     * Builds a {@link TupleCriterion} that only accepts tuples in which the
     * value of <code>column1</code> is less than or equal to the value of
     * <code>column2</code>.
     *
     * @return a {@link ColumnsCriterion} using {@link RelationalOperator#LE}.
     */
    public static TupleCriterion leColumns(final String column1, final String column2) {
        final TupleCriterion criterion = new ColumnsCriterion(column1, column2, RelationalOperator.LE);
        return criterion;
    }

    /**
     * Builds a criterion requiring the value of <code>columnName</code> to
     * be less than the string <code>value</code>.
     *
     * @return a {@link StringCriterion} using {@link RelationalOperator#LT}.
     */
    public static TupleCriterion lt(String columnName, String value) {
        final TupleCriterion criterion = new StringCriterion(columnName, value, RelationalOperator.LT);
        return criterion;
    }

    /**
     * Builds a criterion requiring the value of <code>columnName</code> to
     * be less than the number <code>value</code>, which is compared as a
     * double.
     *
     * @return a {@link NumberCriterion} using {@link RelationalOperator#LT}.
     */
    public static TupleCriterion lt(String columnName, Number value) {
        final double operand = value.doubleValue();
        return new NumberCriterion(columnName, operand, RelationalOperator.LT);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be less than <code>date</code>.
     *
     * @param columnName name of the column to test.
     *
     * @param columnFormat the date format handed to {@link DateCriterion}
     * for the column.
     *
     * @param date the date constraint; its {@link Date#getTime()}
     * milliseconds are what is compared.
     */
    public static TupleCriterion lt(String columnName, String columnFormat, Date date) {
        final long millis = date.getTime();
        return new DateCriterion(columnName, columnFormat, millis, RelationalOperator.LT);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be less than <code>date</code>.
     * <p>
     * No column date format is supplied (<code>null</code> is handed to
     * {@link DateCriterion}).
     *
     * @param columnName name of the column to test.
     *
     * @param date the date constraint; its {@link Date#getTime()}
     * milliseconds are what is compared.
     */
    public static TupleCriterion lt(String columnName, Date date) {
        final long millis = date.getTime();
        return new DateCriterion(columnName, null, millis, RelationalOperator.LT);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be less than the instant of the given calendar.
     *
     * @param columnName name of the column to test.
     *
     * @param columnFormat the date format handed to {@link DateCriterion}
     * for the column.
     *
     * @param date the calendar constraint; its
     * {@link Calendar#getTimeInMillis()} value is what is compared.
     */
    public static TupleCriterion lt(String columnName, String columnFormat, Calendar date) {
        final long millis = date.getTimeInMillis();
        return new DateCriterion(columnName, columnFormat, millis, RelationalOperator.LT);
    }

    /**
     * Builds a criterion requiring the date held by <code>columnName</code>
     * to be less than the instant of the given calendar.
     * <p>
     * No column date format is supplied (<code>null</code> is handed to
     * {@link DateCriterion}).
     *
     * @param columnName name of the column to test.
     *
     * @param date the calendar constraint; its
     * {@link Calendar#getTimeInMillis()} value is what is compared.
     */
    public static TupleCriterion lt(String columnName, Calendar date) {
        final long millis = date.getTimeInMillis();
        return new DateCriterion(columnName, null, millis, RelationalOperator.LT);
    }

    /**
     * Builds a {@link TupleCriterion} that only accepts tuples in which the
     * value of <code>column1</code> is strictly less than the value of
     * <code>column2</code>.
     *
     * @return a {@link ColumnsCriterion} using {@link RelationalOperator#LT}.
     */
    public static TupleCriterion ltColumns(final String column1, final String column2) {
        final TupleCriterion criterion = new ColumnsCriterion(column1, column2, RelationalOperator.LT);
        return criterion;
    }
}