Java tutorial
package com.ebay.erl.mobius.core.criterion; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.net.URI; import java.util.Date; import java.util.ArrayList; import java.util.Calendar; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.util.StringUtils; import com.ebay.erl.mobius.core.builder.Dataset; import com.ebay.erl.mobius.core.collection.CaseInsensitiveTreeSet; import com.ebay.erl.mobius.core.model.Tuple; /** * Factory class that provides methods to define {@link TupleCriterion} * for filtering {@link Tuple}s in a {@link Dataset}. * * <p> * This product is licensed under the Apache License, Version 2.0, * available at http://www.apache.org/licenses/LICENSE-2.0. * * This product contains portions derived from Apache hadoop which is * licensed under the Apache License, Version 2.0, available at * http://hadoop.apache.org. * * 2007 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan */ public class TupleRestrictions { /** * Hadoop configuration */ protected static Configuration conf; /** * Setup Hadoop configuration. */ public static final void configure(Configuration conf) { TupleRestrictions.conf = conf; } private static File checkFileExist(File file) throws FileNotFoundException { if (TupleRestrictions.conf != null && TupleRestrictions.conf.get("mobius.studio.workspace.base") != null) { File base = new File(TupleRestrictions.conf.get("mobius.studio.workspace.base")); File f = new File(base, file.getName()); if (!f.exists()) { throw new FileNotFoundException("File not found in:" + f.getAbsolutePath()); } return f; } else { if (!file.exists()) { throw new FileNotFoundException("File not found:" + file.getAbsolutePath()); } return file; } } /** * Create a {@link TupleCriterion} that only accept tuples with * the value of the specified <code>column</code> that is within * the provide <code>list</code>. * <p> * * The value of the <code>column</code> will be converted into * string, if it's not string, to compare. */ public static TupleCriterion withinString(final String column, final ArrayList<String> list) { return new StringCriterion(column, list, RelationalOperator.WITHIN); } /** * Create a {@link TupleCriterion} that only accepts tuples with * the value of the specified <code>column</code> that is within * the provide <code>list</code>. * <p> * * The value of the <code>column</code> will be converted into * double, if it's not number, to compare. */ public static TupleCriterion withinNumber(final String column, final ArrayList<Double> list) { return new NumberCriterion(column, list, RelationalOperator.WITHIN); } /** * Create a tuple criterion that only accepts tuples when the value * of the <code>column</code> are presented in the given <code>file</code> * <p> * * The assumption of the file is that, it's single column and one to many * line text file. Each line is read into a case insensitive set, and * using the set to check the value of the <code>column</code> within * the set or not. * * * @param column the name of a column to be tested that whether its value is in * the given <code>file</code> or not * * @param file a single column and multiple lines of file that contains strings/numbers, * each line is treated as a single unit. * * @return an instance of {@link TupleCriterion} that extracts only the records * when the value of its <code>column</code> are presented in the given * <code>file</code>. * * @throws FileNotFoundException if the given file cannot be found. */ public static TupleCriterion within(final String column, File file) throws FileNotFoundException { final File f = TupleRestrictions.checkFileExist(file); return new TupleCriterion() { private static final long serialVersionUID = -1121221619118915652L; private Set<String> set; @Override public void setConf(Configuration conf) { try { if (conf.get("tmpfiles") == null || conf.get("tmpfiles").trim().length() == 0) { conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf)); } else { conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf) + "," + conf.get("tmpfiles")); } } catch (IOException e) { throw new IllegalArgumentException(e); } } /** * COPIED FROM org.apache.hadoop.util.GenericOptionsParser */ private String validateFiles(String files, Configuration conf) throws IOException { if (files == null) return null; String[] fileArr = files.split(","); String[] finalArr = new String[fileArr.length]; for (int i = 0; i < fileArr.length; i++) { String tmp = fileArr[i]; String finalPath; Path path = new Path(tmp); URI pathURI = path.toUri(); FileSystem localFs = FileSystem.getLocal(conf); if (pathURI.getScheme() == null) { // default to the local file system // check if the file exists or not first if (!localFs.exists(path)) { throw new FileNotFoundException("File " + tmp + " does not exist."); } finalPath = path.makeQualified(localFs).toString(); } else { // check if the file exists in this file system // we need to recreate this filesystem object to copy // these files to the file system jobtracker is running // on. FileSystem fs = path.getFileSystem(conf); if (!fs.exists(path)) { throw new FileNotFoundException("File " + tmp + " does not exist."); } finalPath = path.makeQualified(fs).toString(); try { fs.close(); } catch (IOException e) { } ; } finalArr[i] = finalPath; } return StringUtils.arrayToString(finalArr); } @Override protected boolean evaluate(Tuple tuple, Configuration configuration) { if (set == null) { set = new CaseInsensitiveTreeSet(); BufferedReader br = null; try { br = new BufferedReader(new FileReader(new File(f.getName()))); String newLine = null; while ((newLine = br.readLine()) != null) { this.set.add(newLine); } } catch (IOException e) { throw new RuntimeException(e); } finally { try { br.close(); } catch (Throwable e) { } } } String value = tuple.getString(column); if (value != null) { return this.set.contains(value); } else { return false; } } @Override public String[] getInvolvedColumns() { return new String[] { column }; } }; } /** * Create a tuple criterion that only accepts tuples * with the value of <code>column</code> that is <b>NOT</b> * presented in the given <code>file</code> * * The assumption of the file is that, it's single column and one to many * line text file. Each line is read into a case insensitive set, and * using the set to check the value of the <code>column</code> within * the set or not. * * @param column the name of a column to be tested that whether its value is in * the given <code>file</code> or not * * @param file a single column and multiple lines of file that contains strings/numbers, * each line is treated as a single unit. * * @return an instance of {@link TupleCriterion} that extracts only the records * when the value of its <code>column</code> are <b>NOT</b>presented in the given * <code>file</code>. * * @throws FileNotFoundException if the given file cannot be found. */ public static TupleCriterion not_within(final String column, final File file) throws FileNotFoundException { TupleCriterion criterion = TupleRestrictions.within(column, file); TupleCriterion notCriterion = criterion.not(); return notCriterion; } /** * Create a {@link TupleCriterion} that only accept tuples with * the value of the specified <code>column</code> is <b>not</b> * within the provide <code>list</code>. * <p> * * The value of the <code>column</code> will be converted into * double to compare, if it's not double. */ public static TupleCriterion notWithinNumber(final String column, final ArrayList<Double> values) { return TupleRestrictions.withinNumber(column, values).not(); } /** * Create a {@link TupleCriterion} that only accept tuples with * the value of the specified <code>column</code> is <b>not</b> * within the provide <code>list</code>. * <p> * * The value of the <code>column</code> will be converted into * string to compare, if it's not string. */ public static TupleCriterion notWithinString(final String column, final ArrayList<String> values) { return TupleRestrictions.withinString(column, values).not(); } /** * Define a {@link TupleCriterion} that only extracts records when the value of the * <code>column</code> meets the <cdoe>regex</code>. * * @param column the name of a column to be tested on its value whether it meets * the specified <code>regex</code> or not. * * @param regex a regular expression to test. * * @return a {@link TupleCriterion} accepts value from the <code>column</code> * match the given <code>regex</code>. * */ public static TupleCriterion regex(final String column, final String regex) { return new TupleCriterion() { private static final long serialVersionUID = -6630104271777176036L; private transient Pattern pattern = Pattern.compile(regex); private transient Matcher matcher = pattern.matcher(""); @Override protected boolean evaluate(Tuple tuple, Configuration configuration) { if (pattern == null) { pattern = Pattern.compile(regex); matcher = pattern.matcher(""); } String value = tuple.getString(column); if (value != null) { matcher.reset(value); return matcher.find(); } else { return false; } } @Override public String[] getInvolvedColumns() { return new String[] { column }; } }; } /** * Create a {@link TupleCriterion} that only accepts * tuples with the value of the given <code>column<code> * is not null nor empty string. */ public static TupleCriterion notNull(final String column) { return new TupleCriterion() { private static final long serialVersionUID = 1573625916312469904L; @Override protected boolean evaluate(Tuple tuple, Configuration configuration) { return tuple.get(column) != null && tuple.getString(column).trim().length() > 0; } @Override public String[] getInvolvedColumns() { return new String[] { column }; } }; } /** * Specify the given <code>column</code>'s value equals to <code>value</code> */ public static TupleCriterion eq(String column, String value) { return new StringCriterion(column, value, RelationalOperator.EQ); } /** * Specify the given <code>column</code>'s value equals to <code>value</code> */ public static TupleCriterion eq(String column, Number value) { return new NumberCriterion(column, value.doubleValue(), RelationalOperator.EQ); } /** * Specify the given <code>column</code>'s value equals to <code>trueFalse</code> */ public static TupleCriterion eq(final String column, final boolean trueFalse) { return new TupleCriterion() { private static final long serialVersionUID = 3652448730224390852L; @Override protected boolean evaluate(Tuple tuple, Configuration configuration) { return tuple.getBoolean(column) == trueFalse; } @Override public String[] getInvolvedColumns() { return new String[] { column }; } }; } /** * Return a {@link TupleCriterion} that parses the value of <column>column</column> * with the given <column>columnDateFormat</column> into milliseconds, comparing the * milliseconds (A) with the <code>date</code> (B) and only accept tuples records when * A equals to B. * * @param column name of a column to be tested in a dataset. * * @param columnDateFormat the date format of the specified <code>column</code> in the dataset. * The <code>columnFormat</code> pattern is the same as {@link java.text.SimpleDateFormat} * * @param date a date constraint to be test. * */ public static TupleCriterion eq(String column, String columnDateFormat, java.util.Date date) { return new DateCriterion(column, columnDateFormat, date.getTime(), RelationalOperator.EQ); } /** * Return a {@link TupleCriterion} that only accepts tuples with * the value of <code>column</code> is equal to the specified * <code>date</code>. * <p> * * If the type of the value for the <code>column</code> is and instance * of {@link java.util.Date}, then the comparison is done by calling the * method of {@link java.util.Date#getTime()} for the value and compare * it with <code>date.getTime()</code>. * <p> * * If the type of the value is not an instance of {@link java.util.Date}, * then it will be parsed into date format using either the format of * <code>yyyy-MM-dd</code> or <code>yyyy-MM-dd HH:mm:ss</code>. * */ public static TupleCriterion eq(String column, java.util.Date date) { return new DateCriterion(column, null, date.getTime(), RelationalOperator.EQ); } /** * Create a {@link TupleCriterion} that only accepts tuples with * the two columns' values are equals. */ public static TupleCriterion eqColumns(final String column1, final String column2) { return new ColumnsCriterion(column1, column2, RelationalOperator.EQ); } /** * not equals */ public static TupleCriterion ne(String columnName, String value) { return new StringCriterion(columnName, value, RelationalOperator.NE); } public static TupleCriterion ne(String columnName, Number value) { return new NumberCriterion(columnName, value.doubleValue(), RelationalOperator.NE); } public static TupleCriterion ne(String columnName, String columnFormat, Date date) { return new DateCriterion(columnName, columnFormat, date.getTime(), RelationalOperator.NE); } public static TupleCriterion ne(String columnName, Date date) { return new DateCriterion(columnName, null, date.getTime(), RelationalOperator.NE); } public static TupleCriterion ne(String columnName, String columnFormat, Calendar date) { return new DateCriterion(columnName, columnFormat, date.getTimeInMillis(), RelationalOperator.NE); } public static TupleCriterion ne(String columnName, Calendar date) { return new DateCriterion(columnName, null, date.getTimeInMillis(), RelationalOperator.NE); } public static TupleCriterion ne(final String column, final boolean trueFalse) { return new TupleCriterion() { private static final long serialVersionUID = 3652448730224390852L; @Override protected boolean evaluate(Tuple tuple, Configuration configuration) { return tuple.getBoolean(column) != trueFalse; } @Override public String[] getInvolvedColumns() { return new String[] { column }; } }; } /** * compare if two column's values are not equals. */ public static TupleCriterion neColumns(final String column1, final String column2) { return new ColumnsCriterion(column1, column2, RelationalOperator.NE); } /** * greater than */ public static TupleCriterion gt(String columnName, String value) { return new StringCriterion(columnName, value, RelationalOperator.GT); } public static TupleCriterion gt(String columnName, Number value) { return new NumberCriterion(columnName, value.doubleValue(), RelationalOperator.GT); } public static TupleCriterion gt(String columnName, String columnFormat, Date date) { return new DateCriterion(columnName, columnFormat, date.getTime(), RelationalOperator.GT); } public static TupleCriterion gt(String columnName, Date date) { return new DateCriterion(columnName, null, date.getTime(), RelationalOperator.GT); } public static TupleCriterion gt(String columnName, String columnFormat, Calendar date) { return new DateCriterion(columnName, columnFormat, date.getTimeInMillis(), RelationalOperator.GT); } public static TupleCriterion gt(String columnName, Calendar date) { return new DateCriterion(columnName, null, date.getTimeInMillis(), RelationalOperator.GT); } /** * compare if column1's value greater than column2's value */ public static TupleCriterion gtColumns(final String column1, final String column2) { return new ColumnsCriterion(column1, column2, RelationalOperator.GT); } /** * greater than or equal */ public static TupleCriterion ge(String columnName, String value) { return new StringCriterion(columnName, value, RelationalOperator.GE); } public static TupleCriterion ge(String columnName, Number value) { return new NumberCriterion(columnName, value.doubleValue(), RelationalOperator.GE); } public static TupleCriterion ge(String columnName, String columnFormat, Date date) { return new DateCriterion(columnName, columnFormat, date.getTime(), RelationalOperator.GE); } public static TupleCriterion ge(String columnName, Date date) { return new DateCriterion(columnName, null, date.getTime(), RelationalOperator.GE); } public static TupleCriterion ge(String columnName, String columnFormat, Calendar date) { return new DateCriterion(columnName, columnFormat, date.getTimeInMillis(), RelationalOperator.GE); } public static TupleCriterion ge(String columnName, Calendar date) { return new DateCriterion(columnName, null, date.getTimeInMillis(), RelationalOperator.GE); } /** * compare if column1's value greater or equals to column2's value */ public static TupleCriterion geColumns(final String column1, final String column2) { return new ColumnsCriterion(column1, column2, RelationalOperator.GE); } /** * less than or equal */ public static TupleCriterion le(String columnName, String value) { return new StringCriterion(columnName, value, RelationalOperator.LE); } public static TupleCriterion le(String columnName, Number value) { return new NumberCriterion(columnName, value.doubleValue(), RelationalOperator.LE); } public static TupleCriterion le(String columnName, String columnFormat, Date date) { return new DateCriterion(columnName, columnFormat, date.getTime(), RelationalOperator.LE); } public static TupleCriterion le(String columnName, Date date) { return new DateCriterion(columnName, null, date.getTime(), RelationalOperator.LE); } public static TupleCriterion le(String columnName, String columnFormat, Calendar date) { return new DateCriterion(columnName, columnFormat, date.getTimeInMillis(), RelationalOperator.LE); } public static TupleCriterion le(String columnName, Calendar date) { return new DateCriterion(columnName, null, date.getTimeInMillis(), RelationalOperator.LE); } /** * compare if column1's value less than column2's value */ public static TupleCriterion leColumns(final String column1, final String column2) { return new ColumnsCriterion(column1, column2, RelationalOperator.LE); } /** * less than */ public static TupleCriterion lt(String columnName, String value) { return new StringCriterion(columnName, value, RelationalOperator.LT); } public static TupleCriterion lt(String columnName, Number value) { return new NumberCriterion(columnName, value.doubleValue(), RelationalOperator.LT); } public static TupleCriterion lt(String columnName, String columnFormat, Date date) { return new DateCriterion(columnName, columnFormat, date.getTime(), RelationalOperator.LT); } public static TupleCriterion lt(String columnName, Date date) { return new DateCriterion(columnName, null, date.getTime(), RelationalOperator.LT); } public static TupleCriterion lt(String columnName, String columnFormat, Calendar date) { return new DateCriterion(columnName, columnFormat, date.getTimeInMillis(), RelationalOperator.LT); } public static TupleCriterion lt(String columnName, Calendar date) { return new DateCriterion(columnName, null, date.getTimeInMillis(), RelationalOperator.LT); } /** * compare if column1's value less or equals to column2's value */ public static TupleCriterion ltColumns(final String column1, final String column2) { return new ColumnsCriterion(column1, column2, RelationalOperator.LT); } }