com.intel.hadoop.hbase.dot.doc.serializer.AvroDoc.java Source code

Introduction

Here is the source code for com.intel.hadoop.hbase.dot.doc.serializer.AvroDoc.java, which defines the package-private schema holder AvroData and the public document serializer AvroDoc.

Source

/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.intel.hadoop.hbase.dot.doc.serializer;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.EncoderFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Writable;

import com.intel.hadoop.hbase.dot.DotUtil;
import com.intel.hadoop.hbase.dot.doc.DocSchemaMissMatchException;
import com.intel.hadoop.hbase.dot.doc.Document;
import com.intel.hadoop.hbase.dot.doc.Document.DocSchemaField;

class AvroData {
    public Schema schema = null;
    List<DocSchemaField> fields = null;
}

/**
 * Avro encoder and decoder for documents.
 * TODO: should catch all Avro exceptions and re-throw an IOException
 */
public class AvroDoc extends Document implements Writable {
    private static final Log LOG = LogFactory.getLog(AvroDoc.class);
    private DecoderFactory decoderfactory = DecoderFactory.get();
    private EncoderFactory encoderfactory = EncoderFactory.get();
    private Schema schema = null;
    private Encoder encoder = null;
    private Decoder decoder = null;
    private List<DocSchemaField> fields = null;

    private ByteArrayOutputStream out = null;

    private GenericRecord record = null;

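    /**
     * Parses a JSON schema string into an AvroData holder. Field names are
     * sorted with Bytes.BYTES_COMPARATOR so their order matches HBase's
     * qualifier ordering, and each one is combined with the document name
     * via DotUtil.combineDocAndField.
     */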
    @Override
    protected AvroData _parseSchema(String schema) {
        AvroData result = new AvroData();
        Schema s = new Schema.Parser().parse(schema);

        byte[] doc = s.getName().getBytes();

        List<Schema.Field> avroFields = s.getFields();
        result.fields = new ArrayList<DocSchemaField>(avroFields.size());

        // TODO: fix me; should sort on the name string directly.
        Set<byte[]> sortFields = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
        for (Schema.Field f : avroFields) {
            sortFields.add(f.name().getBytes());
        }

        for (byte[] field : sortFields) {
            DocSchemaField d = new DocSchemaField();
            d.name = new String(field);
            d.field = field;
            d.docWithField = DotUtil.combineDocAndField(doc, field);
            result.fields.add(d);
        }

        result.schema = s;
        return result;
    }

    protected AvroData _parseSchema(Path schemafile) {
        // TODO: not implemented yet; parse the schema from a file
        return null;
    }

    @Override
    public void loadSchema(String schema) throws IOException {
        // TODO check if the primitive type is bytes only
        this.schema = new Schema.Parser().parse(schema);
    }

    public void loadSchema(Object data) throws IOException {
        if (data == null)
            return;
        if (data instanceof AvroData) {
            this.schema = ((AvroData) data).schema;
            this.fields = ((AvroData) data).fields;
        }
    }

    @Override
    public void loadSchema(Path schemafile) throws IOException {

        // TODO check if the primitive type is bytes only
        // class file

        // avpr file
    }

    @Override
    public Object getSchema() throws IOException {
        return schema;
    }

    @Override
    public String getSchemaInJSON() throws IOException {
        return schema.toString();
    }

    @Override
    public Object getEncoder() throws IOException {
        return this.encoder;
    }

    byte[] getValue(String field) {
        Object value = this.record.get(field);
        if (null != value) {
            ByteBuffer buf = (ByteBuffer) value;
            // When the record is reused, the ByteBuffer's capacity can be larger
            // than the actual data; check the limit and return a byte[] that
            // contains only the real data.
            return buf.limit() == buf.capacity() ? buf.array() : Bytes.getBytes(buf);
        }
        return null;
    }

    @Override
    public byte[] getValue(byte[] field) {
        return getValue(new String(field));
    }

    @Override
    public List<byte[]> getValues() {
        List<byte[]> result = new ArrayList<byte[]>(this.fields.size());
        for (DocSchemaField d : this.fields) {
            result.add(getValue(d.name));
        }
        return result;
    }

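    /**
     * Wraps the value in a ByteBuffer and stores it under the given field;
     * an unknown field surfaces as a DocSchemaMissMatchException.
     */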
    @Override
    public void setValue(byte[] field, byte[] value) throws IOException {
        try {
            this.record.put(new String(field), ByteBuffer.wrap(value));
        } catch (AvroRuntimeException are) {
            throw new DocSchemaMissMatchException(DocSchemaMissMatchException.INVALID_DOC_OR_FIELD);
        } catch (NullPointerException npe) {
            throw new DocSchemaMissMatchException(DocSchemaMissMatchException.INVALID_DOC_OR_FIELD);
        } catch (Exception e) {
            LOG.info("Unexpected exception: " + e);
            LOG.info(e.getStackTrace());
            throw new IOException(e);
        }
    }

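    /**
     * Serializes the current record through the Avro encoder and returns the
     * bytes accumulated in the underlying output stream, or null if the
     * document has not been fully initialized for encoding.
     */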
    @Override
    public byte[] getDoc() throws IOException {
        if (this.record == null || this.schema == null || this.encoder == null || this.out == null) {
            return null;
        } else {
            // LOG.info("record: " + this.record.toString());
            GenericDatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(this.schema);
            writer.write(this.record, this.encoder);
            this.encoder.flush();
            return this.out.toByteArray();
        }
    }

    @Override
    public List<DocSchemaField> getFields() throws IOException {
        return fields;
    }

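    /**
     * Returns true if every schema field already has a value; any missing
     * field is filled with the nullStr placeholder so the record can always
     * be encoded.
     */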
    @Override
    public boolean allValueInitialized() throws IOException {
        boolean isInitialized = true;
        for (Schema.Field f : this.schema.getFields()) {
            // LOG.info("default: " + f.defaultValue());
            String name = f.name();
            if (this.record.get(name) == null) {
                this.record.put(name, ByteBuffer.wrap(this.nullStr.getBytes()));
                isInitialized = false;
                // Don't return early, so that all empty fields get filled with nullStr
            }
        }
        return isInitialized;
    }

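    /**
     * Returns true only if fieldlist matches the schema's field set exactly:
     * same size, and every name resolves to a schema field.
     */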
    @Override
    public boolean allFieldsIncluded(List<byte[]> fieldlist) throws IOException {
        boolean isAllIncluded = true;
        if (this.schema.getFields().size() == fieldlist.size()) {
            for (byte[] f : fieldlist) {
                if (null == this.schema.getField(new String(f))) {
                    isAllIncluded = false;
                    break;
                }
            }
        } else {
            isAllIncluded = false;
        }
        return isAllIncluded;
    }

    @Override
    public Object getDecoder() throws IOException {
        return this.decoder;
    }

    @Override
    public void setDoc(byte[] data) throws IOException {
        setDoc(data, 0, data.length);
    }

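    /**
     * Deserializes an Avro-encoded document from the buffer, reusing the
     * existing decoder and record instances when possible.
     */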
    @Override
    public void setDoc(byte[] buffer, int offset, int len) throws IOException {
        this.decoder = decoderfactory.binaryDecoder(buffer, offset, len, (BinaryDecoder) this.decoder);
        GenericDatumReader<GenericRecord> recordreader = new GenericDatumReader<GenericRecord>(this.schema);
        this.record = (GenericRecord) recordreader.read(this.record, this.decoder);
    }

    @Override
    public void setEncoder(Object encoder) throws IOException {
        this.encoder = (Encoder) encoder;
    }

    @Override
    public void setDecoder(Object decoder) throws IOException {
        this.decoder = (Decoder) decoder;
    }

    @Override
    public OutputStream getOutputStream() throws IOException {
        return this.out;
    }

    @Override
    public void setOutputStream(OutputStream out) throws IOException {
        this.out = (ByteArrayOutputStream) out;
    }

    @Override
    public void initialize(Path schemafile, byte[] data) throws IOException {
        //TODO
    }

    @Override
    public void initialize(String schema, byte[] data) throws IOException {
        initialize(schema, data, (Decoder) null);
    }

    @Override
    public void initialize(Path schema, byte[] data, Object decoder) throws IOException {
        //TODO
    }

    @Override
    public void initialize(String schema, OutputStream out) throws IOException {
        initialize(schema, (Encoder) null, out);
    }

    @Override
    public void initialize(Path schemafile, OutputStream out) throws IOException {
        //TODO
    }

    @Override
    public void initialize(Path schemafile, Object encoder, OutputStream out) throws IOException {
        //TODO
    }

    public void initialize(String schema, byte[] data, Object decoder) throws IOException {
        loadSchema(schema);
        setDecoder(decoder);
        setDoc(data);
    }

    public void initialize(String schema, Object encoder, OutputStream out) throws IOException {
        loadSchema(schema);
        record = new GenericData.Record(this.schema);
        setEncoder(encoder);
        setOutputStream(out);
        if (this.out == null)
            this.out = new ByteArrayOutputStream();
        if (null == encoder) {
            // this.encoder = encoderfactory.jsonEncoder(this.schema, this.out);
            // Write to this.out, which is guaranteed non-null at this point.
            this.encoder = encoderfactory.binaryEncoder(this.out, null);
        }
    }

    @Override
    public void initialize(Object schema, byte[] data, Object decoder) throws IOException {
        loadSchema(schema);
        setDecoder(decoder);
        setDoc(data);
    }

    @Override
    public void initialize(Object schema, Object encoder, OutputStream out) throws IOException {

        loadSchema(schema);
        record = new GenericData.Record(this.schema);

        setEncoder(encoder);
        setOutputStream(out);
        if (this.out == null)
            this.out = new ByteArrayOutputStream();
        if (null == encoder) {
            // this.encoder = encoderfactory.jsonEncoder(this.schema, this.out);
            // Write to this.out, which is guaranteed non-null at this point.
            this.encoder = encoderfactory.binaryEncoder(this.out, null);
        }
    }

    /**
     * Serialization layout: DOC|AVRO|DATA_LEN|DATA_BYTE_ARRAY
     */
    @Override
    public void write(DataOutput out) throws IOException {
        //TODO
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        //TODO
    }

}
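
Example

The sketch below is not part of the original file; it is a minimal, hypothetical usage example, assuming a bytes-only record schema (as the TODOs in loadSchema suggest) and the default no-arg constructor inherited from Document. The class name, schema string, field names, and values are invented for illustration.

package com.intel.hadoop.hbase.dot.doc.serializer;

import java.io.IOException;
import java.io.OutputStream;

public class AvroDocExample {
    public static void main(String[] args) throws IOException {
        // A record schema whose fields are all of the primitive type "bytes".
        String schemaJson = "{\"type\":\"record\",\"name\":\"doc1\",\"fields\":["
                + "{\"name\":\"field1\",\"type\":\"bytes\"},"
                + "{\"name\":\"field2\",\"type\":\"bytes\"}]}";

        // Encode: with a null encoder and stream, AvroDoc creates its own
        // binary encoder over an internal ByteArrayOutputStream. The casts
        // disambiguate the overloaded initialize methods.
        AvroDoc writer = new AvroDoc();
        writer.initialize(schemaJson, (Object) null, (OutputStream) null);
        writer.setValue("field1".getBytes(), "value1".getBytes());
        writer.setValue("field2".getBytes(), "value2".getBytes());
        writer.allValueInitialized(); // fill any unset fields with nullStr
        byte[] encoded = writer.getDoc();

        // Decode: with a null decoder, setDoc creates a binary decoder.
        AvroDoc reader = new AvroDoc();
        reader.initialize(schemaJson, encoded, (Object) null);
        byte[] value = reader.getValue("field1".getBytes());
        System.out.println(new String(value)); // prints "value1"
    }
}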