Java tutorial: UnknownTextSchemaDescriptor.java (from the Cloudera RecordBreaker analyzer package)
/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import java.util.List;
import java.util.Random;
import java.util.Iterator;
import java.util.ArrayList;

import java.io.File;
import java.io.DataOutput;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.InputStreamReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.generic.GenericDatumReader;

import com.cloudera.recordbreaker.learnstructure.InferredType;
import com.cloudera.recordbreaker.learnstructure.LearnStructure;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/************************************************************************
 * <code>UnknownTextSchemaDescriptor</code> returns schema data that we
 * figure out from the data itself.
 *
 * @author "Michael Cafarella"
 * @version 1.0
 * @since 1.0
 * @see SchemaDescriptor
 *************************************************************************/
public class UnknownTextSchemaDescriptor extends GenericSchemaDescriptor {
  private static final Log LOG = LogFactory.getLog(UnknownTextSchemaDescriptor.class);
  public static String SCHEMA_ID = "recordbreaker-recovered";
  public static int MAX_LINES = 1000;

  InferredType typeTree;

  public UnknownTextSchemaDescriptor(DataDescriptor dd) throws IOException {
    super(dd);
    // Superclass calls computeSchema()
  }

  public UnknownTextSchemaDescriptor(DataDescriptor dd, String schemaRepr, byte[] miscPayload) throws IOException {
    super(dd, schemaRepr);
    this.randId = new Random().nextInt();

    // Deserialize the payload string into the parser
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(miscPayload));
    try {
      this.typeTree = InferredType.readType(in);
    } finally {
      in.close();
    }
  }

  int randId;

  void computeSchema() throws IOException {
    this.randId = new Random().nextInt();
    LearnStructure ls = new LearnStructure();
    FileSystem fs = FSAnalyzer.getInstance().getFS();
    FileSystem localFS = FileSystem.getLocal(new Configuration());
    Path inputPath = dd.getFilename();

    File workingParserFile = File.createTempFile("textdesc", "typetree", null);
    File workingSchemaFile = File.createTempFile("textdesc", "schema", null);

    ls.inferRecordFormat(fs, inputPath, localFS,
                         new Path(workingSchemaFile.getCanonicalPath()),
                         new Path(workingParserFile.getCanonicalPath()),
                         null, null, false, MAX_LINES);

    this.schema = Schema.parse(workingSchemaFile);
    DataInputStream in = new DataInputStream(localFS.open(new Path(workingParserFile.getCanonicalPath())));
    try {
      this.typeTree = InferredType.readType(in);
    } catch (IOException iex) {
      iex.printStackTrace();
      throw iex;
    } finally {
      in.close();
    }
    //System.err.println("Recovered unknowntext schema: " + schema);
  }

  public byte[] getPayload() {
    // Serialize the parser, return the resulting string
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(baos);
    try {
      this.typeTree.write(out);
      out.close();
    } catch (IOException iex) {
      return new byte[0];
    }
    return baos.toByteArray();
  }

  /**
   * Iterate through Avro-encoded rows of the file
   */
  public Iterator getIterator() {
    return new Iterator() {
      int lineno = 0;
      BufferedReader in = null;
      Object nextElt = null;
      {
        try {
          in = new BufferedReader(new InputStreamReader(dd.getRawBytes()));
          nextElt = lookahead();
        } catch (IOException iex) {
          LOG.info("iex: " + iex.toString());
          nextElt = null;
        }
      }
      public boolean hasNext() {
        return nextElt != null;
      }
      public synchronized Object next() {
        Object toReturn = nextElt;
        nextElt = lookahead();
        return toReturn;
      }
      public void remove() {
        throw new UnsupportedOperationException();
      }
      Object lookahead() {
        try {
          String str = null;
          while ((str = in.readLine()) != null) {
            GenericContainer resultObj = typeTree.parse(str);
            lineno++;
            if (resultObj != null) {
              return resultObj;
            }
          }
          if (in != null) {
            in.close();
            in = null;
          }
        } catch (IOException iex) {
          iex.printStackTrace();
        }
        return null;
      }
    };
  }

  /**
   * @return a <code>String</code> that annotates the schema
   */
  public String getSchemaSourceDescription() {
    return SCHEMA_ID;
  }
}
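The class above does its real work in two places: computeSchema() runs LearnStructure over the raw text to recover an Avro schema plus an InferredType parse tree, and getIterator() replays the file line by line through that parse tree, yielding Avro GenericContainer records. The sketch below shows how it might be driven; it is a hypothetical illustration, not part of the source above. FileDataDescriptor is an assumed stand-in for whatever concrete DataDescriptor implementation your build provides, the getSchema() accessor is assumed to be inherited from GenericSchemaDescriptor, and the input path is made up.

// Hypothetical usage sketch; assumes it lives alongside the analyzer classes.
import java.util.Iterator;
import org.apache.avro.generic.GenericContainer;
import org.apache.hadoop.fs.Path;

public class UnknownTextSchemaDemo {
  public static void main(String[] args) throws Exception {
    // Assumed helper: FileDataDescriptor stands in for a concrete DataDescriptor
    // that wraps the raw input file.
    DataDescriptor dd = new FileDataDescriptor(new Path("/tmp/sample.log"));

    // The one-argument constructor lets the superclass trigger computeSchema(),
    // which infers both the Avro schema and the InferredType parse tree.
    UnknownTextSchemaDescriptor desc = new UnknownTextSchemaDescriptor(dd);
    System.out.println("Recovered schema: " + desc.getSchema()); // assumed inherited accessor

    // Each element returned by the iterator is an Avro GenericContainer
    // parsed from one line of the input.
    Iterator rows = desc.getIterator();
    while (rows.hasNext()) {
      GenericContainer row = (GenericContainer) rows.next();
      System.out.println(row);
    }

    // The serialized parser round-trips through the byte[] payload, which is
    // exactly what the three-argument constructor expects back.
    byte[] payload = desc.getPayload();
    UnknownTextSchemaDescriptor restored =
        new UnknownTextSchemaDescriptor(dd, desc.getSchema().toString(), payload);
  }
}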