Java tutorial
/** * Copyright (C) 2007-2012 Hypertable, Inc. * * This file is part of Hypertable. * * Hypertable is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 3 * of the License, or any later version. * * Hypertable is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301, USA. */ package org.hypertable.hadoop.mapred; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.text.ParseException; import java.util.ArrayList; import java.util.Date; import java.util.Iterator; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.JobConfigurable; import org.hypertable.thriftgen.*; import org.hypertable.Common.Time; import org.hypertable.hadoop.mapreduce.ScanSpec; import org.hypertable.hadoop.mapred.TableSplit; import org.hypertable.thrift.ThriftClient; import org.apache.thrift.transport.TTransportException; import org.apache.thrift.TException; public class TextTableInputFormat implements org.apache.hadoop.mapred.InputFormat<Text, Text>, JobConfigurable { final Log LOG = LogFactory.getLog(InputFormat.class); public static final String NAMESPACE = "hypertable.mapreduce.namespace"; public static final String INPUT_NAMESPACE = "hypertable.mapreduce.input.namespace"; public static final String TABLE = "hypertable.mapreduce.input.table"; public static final String COLUMNS = "hypertable.mapreduce.input.scan_spec.columns"; public static final String VALUE_REGEXPS = "hypertable.mapreduce.input.scan_spec.value_regexps"; public static final String COLUMN_PREDICATES = "hypertable.mapreduce.input.scan_spec.column_predicates"; public static final String OPTIONS = "hypertable.mapreduce.input.scan_spec.options"; public static final String ROW_INTERVAL = "hypertable.mapreduce.input.scan_spec.row_interval"; public static final String TIMESTAMP_INTERVAL = "hypertable.mapreduce.input.scan_spec.timestamp_interval"; public static final String INCLUDE_TIMESTAMPS = "hypertable.mapreduce.input.include_timestamps"; public static final String NO_ESCAPE = "hypertable.mapreduce.input.no_escape"; public static final String THRIFT_FRAMESIZE = "hypertable.mapreduce.thriftbroker.framesize"; public static final String THRIFT_FRAMESIZE2 = "hypertable.mapreduce.thriftclient.framesize"; private ThriftClient m_client = null; private ScanSpec m_base_spec = null; private String m_namespace = null; private String m_tablename = null; private boolean m_include_timestamps = false; private boolean m_no_escape = false; private static String stripQuotes(String str) { if (str != null && str.length() > 0) { if ((str.charAt(0) == '\'' && str.charAt(str.length() - 1) == '\'') || (str.charAt(0) == '"' && str.charAt(str.length() - 1) == '"')) return str.substring(1, str.length() - 1); } return str; } public void parseOptions(JobConf job) throws ParseException { String str = job.get(OPTIONS); if (str != null) { String[] strs = str.split("\\s"); for (int i = 0; i < strs.length; i++) { strs[i] = strs[i].toUpperCase(); if (strs[i].equals("MAX_VERSIONS") || strs[i].equals("REVS")) { i++; if (i == strs.length) throw new ParseException("Bad OPTIONS spec", i); int value = Integer.parseInt(strs[i]); m_base_spec.setVersions(value); } else if (strs[i].equals("OFFSET")) { i++; if (i == strs.length) throw new ParseException("Bad OPTIONS spec", i); int value = Integer.parseInt(strs[i]); m_base_spec.setRow_offset(value); } else if (strs[i].equals("CELL_OFFSET")) { i++; if (i == strs.length) throw new ParseException("Bad OPTIONS spec", i); int value = Integer.parseInt(strs[i]); m_base_spec.setCell_offset(value); } else if (strs[i].equals("LIMIT")) { i++; if (i == strs.length) throw new ParseException("Bad OPTIONS spec", i); int value = Integer.parseInt(strs[i]); m_base_spec.setRow_limit(value); } else if (strs[i].equals("CELL_LIMIT")) { i++; if (i == strs.length) throw new ParseException("Bad OPTIONS spec", i); int value = Integer.parseInt(strs[i]); m_base_spec.setCell_limit(value); } else if (strs[i].equals("CELL_LIMIT_PER_FAMILY")) { i++; if (i == strs.length) throw new ParseException("Bad OPTIONS spec", i); int value = Integer.parseInt(strs[i]); m_base_spec.setCell_limit_per_family(value); } else if (strs[i].equals("KEYS_ONLY")) { m_base_spec.setKeys_only(true); } else if (strs[i].equals("RETURN_DELETES")) { m_base_spec.setReturn_deletes(true); } else throw new ParseException("Bad OPTIONS spec: " + strs[i], i); } } } public void parseColumns(JobConf job) { String str = job.get(COLUMNS); if (str != null) { String[] columns = str.split(","); for (int i = 0; i < columns.length; i++) m_base_spec.addToColumns(stripQuotes(columns[i])); } } public void parseValueRegexps(JobConf job) { String str = job.get(VALUE_REGEXPS); if (str != null) m_base_spec.setValue_regexp(stripQuotes(str)); } public void parseColumnPredicate(JobConf job) throws ParseException { ColumnPredicate cp = new ColumnPredicate(); String str = job.get(COLUMN_PREDICATES); if (str == null) return; int offset = str.indexOf("=^"); if (offset != -1) { cp.column_family = str.substring(0, offset).trim(); cp.operation = ColumnPredicateOperation.PREFIX_MATCH; cp.value = str.substring(offset + 2).trim(); } else { offset = str.indexOf("="); if (offset != -1) { cp.column_family = str.substring(0, offset).trim(); cp.operation = ColumnPredicateOperation.EXACT_MATCH; cp.value = str.substring(offset + 1).trim(); } else throw new ParseException("Invalid COLUMN_PREDICATE: " + str, 0); } m_base_spec.addToColumn_predicates(cp); } public String[] parseRelopSpec(String str, String name) { String name_uppercase = name.toUpperCase(); String name_lowercase = name.toLowerCase(); String[] strs = new String[5]; int ts_offset = str.indexOf(name_uppercase); if (ts_offset == -1) ts_offset = str.indexOf(name_lowercase); if (ts_offset == -1) return null; strs[0] = str.substring(0, ts_offset).trim(); strs[2] = str.substring(ts_offset, ts_offset + name.length()); strs[4] = str.substring(ts_offset + name.length()).trim(); if (strs[0].length() > 0) { int offset = strs[0].length() - 1; while (offset > 0 && (strs[0].charAt(offset) == '<' || strs[0].charAt(offset) == '=' || strs[0].charAt(offset) == '>')) offset--; if (offset == -1 || offset == strs[0].length() - 1) return null; strs[1] = strs[0].substring(offset + 1); strs[0] = strs[0].substring(0, offset).trim(); } if (strs[4].length() > 0) { int offset = 0; while (offset < strs[4].length() && (strs[4].charAt(offset) == '<' || strs[4].charAt(offset) == '=' || strs[4].charAt(offset) == '>')) offset++; if (offset == strs[4].length() || offset == 0) return null; strs[3] = strs[4].substring(0, offset); strs[4] = strs[4].substring(offset).trim(); } if (strs[0].length() == 0 && strs[4].length() == 0) return null; if (strs[0].length() == 0) { if (strs[3].equals(">") || strs[3].equals(">=")) { strs[0] = strs[4]; if (strs[3].equals(">")) strs[1] = "<="; else if (strs[3].equals(">=")) strs[1] = "<"; strs[3] = null; strs[4] = null; } } else if (strs[4].length() == 0) { if (strs[1].equals(">") || strs[1].equals(">=")) { strs[4] = strs[0]; if (strs[1].equals(">")) strs[3] = "<="; else if (strs[1].equals(">=")) strs[3] = "<"; strs[1] = null; strs[0] = null; } } else { if (strs[1].equals(">") || strs[1].equals(">=")) { if (!strs[3].equals(">") && !strs[3].equals(">=")) return null; String tmp = strs[0]; strs[0] = strs[4]; strs[4] = tmp; if (strs[1].equals(">")) strs[1] = "<="; else if (strs[1].equals(">=")) strs[1] = "<"; if (strs[3].equals(">")) strs[3] = "<="; else if (strs[3].equals(">=")) strs[3] = "<"; } } if (strs[1] != null && strs[1].equals("=") && strs[3] != null && strs[3].equals("=")) return null; strs[0] = stripQuotes(strs[0]); strs[4] = stripQuotes(strs[4]); return strs; } public void parseTimestampInterval(JobConf job) throws ParseException { String str = job.get(TIMESTAMP_INTERVAL); if (str != null) { Date ts; long epoch_time; String[] parsedRelop = parseRelopSpec(str, "TIMESTAMP"); if (parsedRelop == null) throw new ParseException("Invalid TIMESTAMP interval: " + str, 0); if (parsedRelop[0] != null && parsedRelop[0].length() > 0) { ts = Time.parse_ts(parsedRelop[0]); epoch_time = ts.getTime() * 1000000; m_base_spec.setStart_time(epoch_time); if (parsedRelop[1].equals("=")) m_base_spec.setEnd_time(epoch_time); } if (parsedRelop[4] != null && parsedRelop[4].length() > 0) { ts = Time.parse_ts(parsedRelop[4]); epoch_time = ts.getTime() * 1000000; m_base_spec.setEnd_time(epoch_time); if (parsedRelop[3].equals("=")) m_base_spec.setStart_time(epoch_time); } } } public void parseRowInterval(JobConf job) throws ParseException { String str = job.get(ROW_INTERVAL); if (str != null) { Date ts; long epoch_time; String[] parsedRelop = parseRelopSpec(str, "ROW"); RowInterval interval = new RowInterval(); if (parsedRelop == null) throw new ParseException("Invalid ROW interval: " + str, 0); if (parsedRelop[0] != null && parsedRelop[0].length() > 0) { interval.setStart_row(parsedRelop[0]); interval.setStart_rowIsSet(true); if (parsedRelop[1].equals("<")) interval.setStart_inclusive(false); else if (parsedRelop[1].equals("<=")) interval.setStart_inclusive(true); else throw new ParseException("Invalid ROW interval, bad RELOP (" + parsedRelop[1] + ")", 0); interval.setStart_inclusiveIsSet(true); } if (parsedRelop[4] != null && parsedRelop[4].length() > 0) { interval.setEnd_row(parsedRelop[4]); interval.setEnd_rowIsSet(true); if (parsedRelop[3].equals("<")) interval.setEnd_inclusive(false); else if (parsedRelop[3].equals("<=")) interval.setEnd_inclusive(true); else throw new ParseException("Invalid ROW interval, bad RELOP (" + parsedRelop[3] + ")", 0); interval.setEnd_inclusiveIsSet(true); } if (interval.isSetStart_row() || interval.isSetEnd_row()) { m_base_spec.addToRow_intervals(interval); m_base_spec.setRow_intervalsIsSet(true); } } } public void configure(JobConf job) { m_include_timestamps = job.getBoolean(INCLUDE_TIMESTAMPS, false); m_no_escape = job.getBoolean(NO_ESCAPE, false); try { m_base_spec = new ScanSpec(); parseColumns(job); parseOptions(job); parseTimestampInterval(job); parseRowInterval(job); parseValueRegexps(job); parseColumnPredicate(job); System.out.println(m_base_spec); } catch (Exception e) { e.printStackTrace(); System.exit(-1); } } public RecordReader<Text, Text> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException { try { TableSplit ts = (TableSplit) split; if (m_namespace == null) { m_namespace = job.get(INPUT_NAMESPACE); if (m_namespace == null) m_namespace = job.get(NAMESPACE); } if (m_tablename == null) m_tablename = job.get(TABLE); ScanSpec scan_spec = ts.createScanSpec(m_base_spec); System.out.println(scan_spec); if (m_client == null) { int framesize = job.getInt(THRIFT_FRAMESIZE, 0); if (framesize == 0) framesize = job.getInt(THRIFT_FRAMESIZE2, 0); if (framesize != 0) m_client = ThriftClient.create("localhost", 38080, 1600000, true, framesize); else m_client = ThriftClient.create("localhost", 38080); } return new HypertableRecordReader(m_client, m_namespace, m_tablename, scan_spec, m_include_timestamps, m_no_escape); } catch (TTransportException e) { e.printStackTrace(); throw new IOException(e.getMessage()); } catch (TException e) { e.printStackTrace(); throw new IOException(e.getMessage()); } } public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { long ns = 0; try { RowInterval ri = null; if (m_client == null) { int framesize = job.getInt(THRIFT_FRAMESIZE, 0); if (framesize == 0) framesize = job.getInt(THRIFT_FRAMESIZE2, 0); if (framesize != 0) m_client = ThriftClient.create("localhost", 38080, 1600000, true, framesize); else m_client = ThriftClient.create("localhost", 38080); } String tablename = job.get(TABLE); String namespace = job.get(INPUT_NAMESPACE); if (namespace == null) namespace = job.get(NAMESPACE); java.util.Iterator<RowInterval> iter = m_base_spec.getRow_intervalsIterator(); if (iter != null && iter.hasNext()) { ri = iter.next(); if (iter.hasNext()) { System.out.println("InputFormat only allows a single ROW interval"); System.exit(-1); } } ns = m_client.namespace_open(namespace); List<org.hypertable.thriftgen.TableSplit> tsplits = m_client.table_get_splits(ns, tablename); List<InputSplit> splits = new ArrayList<InputSplit>(tsplits.size()); for (final org.hypertable.thriftgen.TableSplit ts : tsplits) { if (ri == null || ((!ri.isSetStart_row() || ts.end_row == null || ts.end_row.compareTo(ri.getStart_row()) > 0 || (ts.end_row.compareTo(ri.getStart_row()) == 0 && ri.isStart_inclusive())) && (!ri.isSetEnd_row() || ts.start_row == null || ts.start_row.compareTo(ri.getEnd_row()) < 0))) { byte[] start_row = (ts.start_row == null) ? null : ts.start_row.getBytes("UTF-8"); byte[] end_row = (ts.end_row == null) ? null : ts.end_row.getBytes("UTF-8"); TableSplit split = new TableSplit(tablename.getBytes("UTF-8"), start_row, end_row, ts.hostname); splits.add(split); } } InputSplit[] isplits = new InputSplit[splits.size()]; return splits.toArray(isplits); } catch (TTransportException e) { e.printStackTrace(); throw new IOException(e.getMessage()); } catch (TException e) { e.printStackTrace(); throw new IOException(e.getMessage()); } catch (ClientException e) { e.printStackTrace(); throw new IOException(e.getMessage()); } catch (UnsupportedEncodingException e) { e.printStackTrace(); throw new IOException(e.getMessage()); } finally { if (ns != 0) { try { m_client.namespace_close(ns); } catch (Exception e) { e.printStackTrace(); throw new IOException(e.getMessage()); } } } } }