Java tutorial: splitting vector data with MrGeo's SplitVectorMapOp
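The listing below is SplitVectorMapOp from MrGeo's map algebra package. The operation partitions a vector data source into complementary "test" and "training" subsets: every splitCount-th record, offset by currentSplit - 1, goes to the test split, and everything else goes to the training split. A standalone sketch of that rule follows the listing.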
/*
 * Copyright 2009-2015 DigitalGlobe, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.mrgeo.mapalgebra;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mrgeo.format.InlineCsvInputFormat;
import org.mrgeo.geometry.Geometry;
import org.mrgeo.hdfs.utils.HadoopFileUtils;
import org.mrgeo.hdfs.vector.Column;
import org.mrgeo.hdfs.vector.Column.FactorType;
import org.mrgeo.hdfs.vector.ColumnDefinitionFile;
import org.mrgeo.mapalgebra.parser.ParserAdapter;
import org.mrgeo.mapalgebra.parser.ParserNode;
import org.mrgeo.mapreduce.job.JobCancelledException;
import org.mrgeo.mapreduce.job.JobFailedException;
import org.mrgeo.progress.Progress;
import org.mrgeo.progress.ProgressHierarchy;
import org.mrgeo.utils.HadoopUtils;

import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

public class SplitVectorMapOp extends VectorMapOp
{
  public static final String SPLIT_TYPE_TEST = "test";
  public static final String SPLIT_TYPE_TRAINING = "training";

  private int splitCount;
  private int currentSplit;
  private String splitType;

  public static String[] register()
  {
    return new String[] { "SplitVector" };
  }

  @Override
  public void addInput(MapOp n) throws IllegalArgumentException
  {
    if (_inputs.size() == 0)
    {
      if (!(n instanceof VectorMapOp))
      {
        throw new IllegalArgumentException("The first parameter must be a vector input.");
      }
      _inputs.add(n);
    }
    else
    {
      throw new IllegalArgumentException("Only one input is supported.");
    }
  }

  private void determineOutputForInlineCsvInput(InlineCsvInputFormatDescriptor ifd, char delim)
      throws IOException
  {
    // Set up a reader to be able to stream features from the input source
    InlineCsvInputFormat.InlineCsvReader csvReader = new InlineCsvInputFormat.InlineCsvReader();
    csvReader.initialize(ifd._columns, ifd._values);
    ColumnDefinitionFile inputCDF = csvReader.getColumnDefinitionFile();

    FileSystem dfs = HadoopFileUtils.getFileSystem(new Path(_outputName));
    FSDataOutputStream os = dfs.create(new Path(_outputName), true);
    PrintWriter pw = new PrintWriter(new java.io.OutputStreamWriter(os));
    try
    {
      long lineNumber = 0;
      boolean isTestSplit = splitType.equalsIgnoreCase(SPLIT_TYPE_TEST);
      while (csvReader.nextFeature())
      {
        Geometry feature = csvReader.getCurrentFeature();
        // Every splitCount-th feature (offset by currentSplit - 1) belongs to
        // the "test" split; all other features belong to "training".
        if ((lineNumber % splitCount) == (currentSplit - 1))
        {
          if (isTestSplit)
          {
            String strFeature = featureToString(inputCDF, feature, delim);
            pw.println(strFeature);
          }
        }
        else
        {
          if (!isTestSplit)
          {
            String strFeature = featureToString(inputCDF, feature, delim);
            pw.println(strFeature);
          }
        }
        lineNumber++;
      }
    }
    finally
    {
      pw.close();
      if (os != null)
      {
        os.close();
      }
    }

    // Copy the input columns to the output columns, excluding the stats
    // because we've filtered the actual data, so the stats will be wrong.
    Path outputColumnsPath = new Path(_outputName + ".columns");
    ColumnDefinitionFile outputCDF = new ColumnDefinitionFile();
    Vector<Column> columns = inputCDF.getColumns();
    Vector<Column> newColumns = new Vector<Column>();
    for (Column column : columns)
    {
      newColumns.add(new Column(column.getName(), column.getType()));
    }
    outputCDF.setColumns(newColumns);
    outputCDF.store(outputColumnsPath);

    _output = new BasicInputFormatDescriptor(_outputName);
  }

  // Serialize one feature as a delimited text record, quoting non-numeric columns.
  private static String featureToString(ColumnDefinitionFile cdf, Geometry feature, char delim)
  {
    StringBuffer sb = new StringBuffer();
    Vector<Column> columns = cdf.getColumns();
    for (Column column : columns)
    {
      Object value = feature.getAttribute(column.getName());
      if (sb.length() > 0)
      {
        sb.append(delim);
      }
      if (column.getType() != FactorType.Numeric)
      {
        sb.append('\"');
      }
      sb.append(value.toString());
      if (column.getType() != FactorType.Numeric)
      {
        sb.append('\"');
      }
    }
    return sb.toString();
  }

  @Override
  public String resolveOutputName() throws IOException
  {
    if (_outputName == null)
    {
      MapOp inputMapOp = _inputs.get(0);
      String outputBase = HadoopUtils.createRandomString(40);
      Path outputParent = HadoopFileUtils.getTempDir();
      if (inputMapOp instanceof InlineCsvMapOp)
      {
        _outputName = new Path(outputParent, outputBase + ".csv").toString();
        addTempFile(_outputName);
        return _outputName;
      }
      else
      {
        // Must be a VectorMapOp; match the output extension to the input's.
        Path inputPath = new Path(((VectorMapOp) inputMapOp).getOutputName());
        if (inputPath.toString().endsWith(".tsv"))
        {
          _outputName = new Path(outputParent, outputBase + ".tsv").toString();
        }
        else if (inputPath.toString().endsWith(".csv"))
        {
          _outputName = new Path(outputParent, outputBase + ".csv").toString();
        }
        else
        {
          throw new IOException("Unable to split input: " + inputPath.toString());
        }
        addTempFile(_outputName);
      }
    }
    return _outputName;
  }

  @Override
  public void build(Progress p) throws IOException, JobFailedException, JobCancelledException
  {
    ProgressHierarchy ph = new ProgressHierarchy(p);
    ph.createChild(1.0f);
    ph.createChild(1.0f);
    MapOp inputMapOp = _inputs.get(0);
    Path inputPath = null;

    // TODO:
    // The following code is an ugly hack until we have time to re-factor the direct
    // reading of vector data. Right now, there is generic code for doing this (see
    // AutoFeatureInputFormat), but it's tightly coupled to the map/reduce InputFormat
    // and splits. We need to re-factor it so that the core part for reading the
    // vector data is independent of InputFormat. This represents a first step in that
    // direction where InlineCsvInputFormat itself was re-factored. That's why there's
    // a special case below - because the other vector formats have not been re-factored
    // and there is no generic interface in place for reading any vector data.
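    // Two code paths follow: inline CSV inputs are streamed in-memory through
    // the refactored InlineCsvInputFormat reader, while every other vector
    // input is re-read from HDFS as delimited text and split line by line.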
    if (inputMapOp instanceof InlineCsvMapOp)
    {
      InlineCsvInputFormatDescriptor ifd =
          (InlineCsvInputFormatDescriptor) ((VectorMapOp) inputMapOp).getVectorOutput();
      determineOutputForInlineCsvInput(ifd, ',');
      return;
    }
    else if (inputMapOp instanceof VectorMapOp)
    {
      inputPath = new Path(((VectorMapOp) inputMapOp).getOutputName());
    }
    else
    {
      // defensive code since input should be VectorMapOp - see addInput()
      throw new IllegalArgumentException("Invalid value for vector argument to SplitVector");
    }

    // SplitVectorDriver svd = new SplitVectorDriver();
    // svd.run(getConf(), inputPath, splitCount, currentSplit, splitType, _outputName, p, jobListener);

    FileSystem dfs = HadoopFileUtils.getFileSystem(inputPath);
    if (!dfs.exists(inputPath))
    {
      throw new IOException(
          String.format("Cannot split vector input, %s does not exist", inputPath.toString()));
    }

    // Collect the delimited text files to read: either the input itself,
    // or the "part-*" files inside the input directory.
    FileStatus[] outputFiles = dfs.listStatus(inputPath);
    List<Path> tsvFiles = new ArrayList<Path>();
    if (dfs.isFile(inputPath))
    {
      tsvFiles.add(inputPath);
    }
    else
    {
      for (FileStatus fileStatus : outputFiles)
      {
        if (!fileStatus.isDir())
        {
          Path fp = fileStatus.getPath();
          String name = fp.getName();
          if (name.startsWith("part-"))
          {
            tsvFiles.add(fp);
          }
        }
      }
    }

    FSDataOutputStream os = dfs.create(new Path(_outputName), true);
    PrintWriter pw = new PrintWriter(new java.io.OutputStreamWriter(os));
    try
    {
      long lineNumber = 0;
      boolean isTestSplit = splitType.equalsIgnoreCase(SPLIT_TYPE_TEST);
      for (Path tsvFile : tsvFiles)
      {
        InputStream is = HadoopFileUtils.open(tsvFile); // dfs.open(tsvFile);
        java.io.BufferedReader r = new java.io.BufferedReader(new java.io.InputStreamReader(is));
        try
        {
          String line;
          while ((line = r.readLine()) != null)
          {
            // Same modulo rule as the inline CSV case above.
            if ((lineNumber % splitCount) == (currentSplit - 1))
            {
              if (isTestSplit)
              {
                pw.println(line);
              }
            }
            else
            {
              if (!isTestSplit)
              {
                pw.println(line);
              }
            }
            lineNumber++;
          }
        }
        finally
        {
          r.close();
          if (is != null)
          {
            is.close();
          }
        }
      }
    }
    finally
    {
      pw.close();
      if (os != null)
      {
        os.close();
      }
    }

    // Copy the input columns to the output columns, excluding the stats
    // because we've filtered the actual data, so the stats will be wrong.
    Path inputColumnsPath = new Path(inputPath.toString() + ".columns");
    Path outputColumnsPath = new Path(_outputName + ".columns");
    ColumnDefinitionFile inputCDF = new ColumnDefinitionFile(inputColumnsPath);
    ColumnDefinitionFile outputCDF = new ColumnDefinitionFile();
    Vector<Column> columns = inputCDF.getColumns();
    Vector<Column> newColumns = new Vector<Column>();
    for (Column column : columns)
    {
      newColumns.add(new Column(column.getName(), column.getType()));
    }
    outputCDF.setColumns(newColumns);
    outputCDF.store(outputColumnsPath);

    _output = new BasicInputFormatDescriptor(_outputName);
  }

  @Override
  public void moveOutput(String toName) throws IOException
  {
    super.moveOutput(toName);
    _outputName = toName;
    _output = new BasicInputFormatDescriptor(_outputName);
  }

  @Override
  public Vector<ParserNode> processChildren(final Vector<ParserNode> children, final ParserAdapter parser)
  {
    Vector<ParserNode> result = new Vector<ParserNode>();

    if (children.size() != 4)
    {
      throw new IllegalArgumentException(
          "SplitVector usage: SplitVector(<vector source>, splitCount, currentSplit, splitType)");
    }

    // Validate splitCount
    splitCount = parseChildInt(children.get(1), "splitCount", parser);
    if (splitCount <= 1)
    {
      throw new IllegalArgumentException("splitCount must be > 1");
    }

    // Validate currentSplit
    currentSplit = parseChildInt(children.get(2), "currentSplit", parser);
    if (currentSplit < 1 || currentSplit > splitCount)
    {
      throw new IllegalArgumentException("currentSplit must be >= 1 and <= splitCount");
    }

    // Validate splitType, and convert from mixed case for the back-end
    splitType = parseChildString(children.get(3), "splitType", parser);
    if (splitType.equalsIgnoreCase(SPLIT_TYPE_TEST))
    {
      splitType = SPLIT_TYPE_TEST;
    }
    else if (splitType.equalsIgnoreCase(SPLIT_TYPE_TRAINING))
    {
      splitType = SPLIT_TYPE_TRAINING;
    }
    else
    {
      throw new IllegalArgumentException("splitType must be either \"test\" or \"training\"");
    }

    result.add(children.get(0));
    return result;
  }

  @Override
  public String toString()
  {
    return String.format("SplitVectorMapOp %s", _outputName == null ? "null" : _outputName);
  }
}
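The core of SplitVector is the modulo rule that routes each record to exactly one of the two splits. The following minimal sketch isolates that rule; it has no MrGeo dependencies, and the names SplitRuleDemo and isTestRecord are hypothetical, introduced here only for illustration.

// A minimal sketch of SplitVector's record-routing rule. The class and
// method names are hypothetical; only the predicate mirrors the real code.
public class SplitRuleDemo
{
  // Same predicate as SplitVectorMapOp: every splitCount-th record,
  // offset by (currentSplit - 1), lands in the "test" split.
  static boolean isTestRecord(long recordNumber, int splitCount, int currentSplit)
  {
    return (recordNumber % splitCount) == (currentSplit - 1);
  }

  public static void main(String[] args)
  {
    int splitCount = 3;   // one third test, two thirds training
    int currentSplit = 1; // which of the splitCount offsets is "test"
    for (long record = 0; record < 9; record++)
    {
      System.out.println("record " + record + " -> "
          + (isTestRecord(record, splitCount, currentSplit) ? "test" : "training"));
    }
  }
}

With this configuration, records 0, 3, and 6 go to "test" and the rest to "training". Running the same source through the operation twice with identical splitCount and currentSplit, once per splitType, therefore yields two outputs that together cover every input record exactly once, matching the usage form SplitVector(<vector source>, splitCount, currentSplit, splitType) validated in processChildren().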