com.cloudera.recordbreaker.analyzer.UnknownTextSchemaDescriptor.java Source code

Introduction

Here is the source code for com.cloudera.recordbreaker.analyzer.UnknownTextSchemaDescriptor.java
Source

/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Random;

import com.cloudera.recordbreaker.learnstructure.InferredType;
import com.cloudera.recordbreaker.learnstructure.LearnStructure;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.generic.GenericDatumReader;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/************************************************************************
 * <code>UnknownTextSchemaDescriptor</code> returns schema data that we
 * figure out from the data itself.
 *
 * The schema and an {@link InferredType} parse tree are either inferred
 * from the underlying text file via {@link LearnStructure} (see
 * {@link #computeSchema()}), or restored from a previously serialized
 * payload (see {@link #getPayload()}).  The parse tree is then used to
 * turn each line of the raw text into a record (see {@link #getIterator()}).
 *
 * @author "Michael Cafarella"
 * @version 1.0
 * @since 1.0
 * @see SchemaDescriptor
 *************************************************************************/
public class UnknownTextSchemaDescriptor extends GenericSchemaDescriptor {
    private static final Log LOG = LogFactory.getLog(UnknownTextSchemaDescriptor.class);

    /** Label returned by {@link #getSchemaSourceDescription()}. */
    public static String SCHEMA_ID = "recordbreaker-recovered";

    /** Line limit handed to <code>LearnStructure.inferRecordFormat()</code>. */
    public static int MAX_LINES = 1000;

    // NOTE: these two fields must NOT be given explicit initializers.  Field
    // initializers run after the superclass constructor returns, so an
    // initializer here would overwrite whatever computeSchema() stored while
    // it was being invoked from super(dd).

    /** Parse tree used to convert a line of text into a record. */
    InferredType typeTree;

    /** Random value assigned whenever a descriptor is built; not read inside this class. */
    int randId;

    /**
     * Builds a descriptor by inferring the schema from the data itself.
     *
     * @param dd descriptor of the data to analyze
     * @throws IOException if the superclass constructor (and the schema
     *         inference it triggers) fails
     */
    public UnknownTextSchemaDescriptor(DataDescriptor dd) throws IOException {
        super(dd);
        // Superclass calls computeSchema()
    }

    /**
     * Rebuilds a descriptor from previously stored state.
     *
     * @param dd descriptor of the data this schema belongs to
     * @param schemaRepr schema representation, passed through to the superclass
     * @param miscPayload serialized parse tree, as produced by {@link #getPayload()}
     * @throws IOException if the payload is missing or cannot be deserialized
     */
    public UnknownTextSchemaDescriptor(DataDescriptor dd, String schemaRepr, byte[] miscPayload)
            throws IOException {
        super(dd, schemaRepr);
        this.randId = new Random().nextInt();

        if (miscPayload == null) {
            // Report a missing payload through the declared exception type
            // instead of an unexplained NullPointerException.
            throw new IOException("No serialized type tree payload was supplied");
        }

        // Deserialize the payload bytes into the parser
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(miscPayload));
        try {
            this.typeTree = InferredType.readType(in);
        } finally {
            in.close();
        }
    }

    /**
     * Infers the schema and parse tree for the described file.
     *
     * <code>LearnStructure</code> writes its results to two temporary local
     * files; they are read back into {@link #schema} and {@link #typeTree}
     * and then removed.
     *
     * @throws IOException if inference fails or its output cannot be read
     */
    void computeSchema() throws IOException {
        this.randId = new Random().nextInt();
        LearnStructure ls = new LearnStructure();
        FileSystem fs = FSAnalyzer.getInstance().getFS();
        FileSystem localFS = FileSystem.getLocal(new Configuration());
        Path inputPath = dd.getFilename();

        File workingParserFile = null;
        File workingSchemaFile = null;
        try {
            workingParserFile = File.createTempFile("textdesc", "typetree", null);
            workingSchemaFile = File.createTempFile("textdesc", "schema", null);
            Path schemaPath = new Path(workingSchemaFile.getCanonicalPath());
            Path parserPath = new Path(workingParserFile.getCanonicalPath());

            ls.inferRecordFormat(fs, inputPath, localFS, schemaPath, parserPath,
                    null, null, false, MAX_LINES);

            this.schema = Schema.parse(workingSchemaFile);
            DataInputStream in = new DataInputStream(localFS.open(parserPath));
            try {
                this.typeTree = InferredType.readType(in);
            } catch (IOException iex) {
                LOG.error("Unable to read the inferred type tree for " + inputPath, iex);
                throw iex;
            } finally {
                in.close();
            }
        } finally {
            // The working files are only an exchange medium with LearnStructure;
            // without this cleanup every analyzed file leaves two temp files behind.
            deleteWorkingFile(localFS, workingParserFile);
            deleteWorkingFile(localFS, workingSchemaFile);
        }
    }

    /**
     * Best-effort removal of a temporary working file.  Failures are logged
     * and never propagated, so cleanup cannot mask the real outcome of
     * {@link #computeSchema()}.
     *
     * @param localFS local file system the working file was accessed through
     * @param workingFile file to remove; may be null if it was never created
     */
    private static void deleteWorkingFile(FileSystem localFS, File workingFile) {
        if (workingFile == null) {
            return;
        }
        try {
            // Go through the Hadoop FileSystem first so that any side files it
            // keeps next to the data file (e.g. checksums) can be removed too.
            localFS.delete(new Path(workingFile.getCanonicalPath()), false);
        } catch (IOException iex) {
            LOG.warn("Unable to delete temporary file " + workingFile + " via local FileSystem", iex);
        }
        if (workingFile.exists() && !workingFile.delete()) {
            LOG.warn("Unable to delete temporary file " + workingFile);
        }
    }

    /**
     * Serializes the parse tree so that it can later be handed back to the
     * three-argument constructor.
     *
     * @return the serialized parse tree, or an empty array if serialization fails
     */
    public byte[] getPayload() {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(baos);
        try {
            try {
                this.typeTree.write(out);
            } finally {
                out.close();
            }
        } catch (IOException iex) {
            LOG.warn("Unable to serialize the inferred type tree; returning an empty payload", iex);
            return new byte[0];
        }
        return baos.toByteArray();
    }

    /**
     * Iterate through Avro-encoded rows of the file.
     *
     * Each line of the raw data is run through the parse tree; lines for
     * which the parser returns null are skipped.  The underlying reader is
     * closed once the data is exhausted or a read error occurs.
     *
     * @return an iterator over the parsed records; <code>remove()</code> is
     *         not supported
     */
    public Iterator getIterator() {
        return new Iterator() {
            // Number of lines read so far; used only for diagnostics.
            int lineno = 0;
            // Null when the data could not be opened or has been fully consumed.
            BufferedReader in = null;
            // Record that the following call to next() will return; null means "no more".
            Object nextElt = null;
            {
                try {
                    // NOTE(review): no charset is given, so the platform default
                    // is used to decode the bytes - confirm that this is intended.
                    in = new BufferedReader(new InputStreamReader(dd.getRawBytes()));
                    nextElt = lookahead();
                } catch (IOException iex) {
                    LOG.warn("Unable to open the raw data for iteration", iex);
                    nextElt = null;
                }
            }

            public boolean hasNext() {
                return nextElt != null;
            }

            public synchronized Object next() {
                if (nextElt == null) {
                    // Honor the Iterator contract rather than failing with a
                    // NullPointerException on the already-closed reader.
                    throw new NoSuchElementException();
                }
                Object toReturn = nextElt;
                nextElt = lookahead();
                return toReturn;
            }

            public void remove() {
                throw new UnsupportedOperationException();
            }

            /**
             * Reads forward to the next line that the parse tree accepts.
             *
             * @return the parsed record, or null when no further record is available
             */
            Object lookahead() {
                if (in == null) {
                    return null;
                }
                try {
                    String str = null;
                    while ((str = in.readLine()) != null) {
                        lineno++;
                        GenericContainer resultObj = typeTree.parse(str);
                        if (resultObj != null) {
                            return resultObj;
                        }
                    }
                } catch (IOException iex) {
                    LOG.warn("Error after reading " + lineno + " line(s); ending iteration", iex);
                }
                // Reached on end of data and on error alike: release the reader.
                closeReader();
                return null;
            }

            /** Closes the reader, if still open, and marks the iterator as drained. */
            void closeReader() {
                if (in != null) {
                    try {
                        in.close();
                    } catch (IOException iex) {
                        LOG.warn("Unable to close the raw data reader", iex);
                    }
                    in = null;
                }
            }
        };
    }

    /**
     * @return a <code>String</code> that annotates the schema
     */
    public String getSchemaSourceDescription() {
        return SCHEMA_ID;
    }
}