com.mongodb.hadoop.pig.MongoStorage.java Source code

Introduction

Here is the source code for com.mongodb.hadoop.pig.MongoStorage.java, a Pig StoreFunc that stores the tuples of a Pig relation in a MongoDB collection.

Source

/*
 * Copyright 2011 10gen Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mongodb.hadoop.pig;

import com.mongodb.*;
import com.mongodb.hadoop.*;
import com.mongodb.hadoop.output.*;
import com.mongodb.hadoop.util.*;
import org.apache.commons.logging.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.pig.*;
import org.apache.pig.data.*;
import org.apache.pig.impl.util.*;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;

import java.io.*;
import java.text.ParseException;
import java.util.*;

public class MongoStorage extends StoreFunc implements StoreMetadata {

    private static final Log log = LogFactory.getLog(MongoStorage.class);
    // Pig specific settings
    static final String PIG_OUTPUT_SCHEMA = "mongo.pig.output.schema";
    static final String PIG_OUTPUT_SCHEMA_UDF_CONTEXT = "mongo.pig.output.schema.udf_context";
    protected ResourceSchema schema = null;
    private final MongoStorageOptions options;

    public MongoStorage() {
        this.options = null;
    }

    /**
     * Takes a list of arguments of two types: 
     * <ul>
     * <li>A single set of keys to base updates on, in the format:<br />
     * 'update [time, user]' or 'multi [time, user]' for multi updates</li>
     * 
     * <li>Multiple indexes to ensure in the format:<br />
     * '{time: 1, user: 1},{unique: true}'<br />
     * (The syntax is exactly like db.col.ensureIndex())</li>
     * </ul>
     * Example:<br />
     * STORE Result INTO '$db' USING com.mongodb.hadoop.pig.MongoStorage('update [time, servername, hostname]', '{time : 1, servername : 1, hostname : 1}, {unique:true, dropDups: true}');
     * @param args update and index options in the formats described above
     * @throws ParseException if the arguments cannot be parsed
     */
    public MongoStorage(String... args) throws ParseException {
        this.options = MongoStorageOptions.parseArguments(args);
    }

    public void checkSchema(ResourceSchema schema) throws IOException {
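        // Called on the front end: stash the schema string in the UDFContext,
        // keyed by this store's signature, so prepareToWrite() can rebuild it
        // on the backend.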
        log.info("checking schema " + schema.toString());
        this.schema = schema;
        final Properties properties = UDFContext.getUDFContext().getUDFProperties(this.getClass(),
                new String[] { _udfContextSignature });
        properties.setProperty(PIG_OUTPUT_SCHEMA_UDF_CONTEXT, schema.toString());
    }

    public void storeSchema(ResourceSchema schema, String location, Job job) {
        // not implemented
    }

    public void storeStatistics(ResourceStatistics stats, String location, Job job) {
        // not implemented
    }

    public void putNext(Tuple tuple) throws IOException {
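        // Convert the tuple into a BasicDBObject, one entry per field of the
        // output schema, and write it through the MongoRecordWriter.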
        log.info("writing " + tuple.toString());
        final BasicDBObjectBuilder builder = BasicDBObjectBuilder.start();

        ResourceFieldSchema[] fields = this.schema.getFields();
        for (int i = 0; i < fields.length; i++) {
            writeField(builder, fields[i], tuple.get(i));
        }

        log.info("writing out:" + builder.get().toString());
        _recordWriter.write(null, builder.get());
    }

    protected void writeField(BasicDBObjectBuilder builder, ResourceSchema.ResourceFieldSchema field, Object d)
            throws IOException {
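        // Write one Pig value as a BSON field, dispatching on its declared type:
        // scalars map directly, a TUPLE becomes a sub-document, a BAG becomes an
        // array of sub-documents, and a MAP is merged into the enclosing document.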

        // If the field is missing or the value is null, write a null
        if (d == null) {
            builder.add(field.getName(), d);
            return;
        }

        ResourceSchema s = field.getSchema();

        // Based on the field's type, write it out
        switch (field.getType()) {
        case DataType.INTEGER:
            builder.add(field.getName(), (Integer) d);
            return;

        case DataType.LONG:
            builder.add(field.getName(), (Long) d);
            return;

        case DataType.FLOAT:
            builder.add(field.getName(), (Float) d);
            return;

        case DataType.DOUBLE:
            builder.add(field.getName(), (Double) d);
            return;

        case DataType.BYTEARRAY:
            builder.add(field.getName(), d.toString());
            return;

        case DataType.CHARARRAY:
            builder.add(field.getName(), (String) d);
            return;

        // Given a TUPLE, build a Map so the BSON encoder writes it as a sub-document
        case DataType.TUPLE:
            if (s == null) {
                throw new IOException("Schemas must be fully specified to use "
                        + "this storage function.  No schema found for field " + field.getName());
            }
            ResourceSchema.ResourceFieldSchema[] fs = s.getFields();
            Map<String, Object> m = new LinkedHashMap<String, Object>();
            for (int j = 0; j < fs.length; j++) {
                m.put(fs[j].getName(), ((Tuple) d).get(j));
            }
            builder.add(field.getName(), m);
            return;

        // Given a BAG, build a List so the BSON encoder writes it as an array of sub-documents.
        case DataType.BAG:
            if (s == null) {
                throw new IOException("Schemas must be fully specified to use "
                        + "this storage function.  No schema found for field " + field.getName());
            }
            fs = s.getFields();
            if (fs.length != 1 || fs[0].getType() != DataType.TUPLE) {
                throw new IOException("Found a bag without a tuple inside!");
            }
            // Drill down the next level to the tuple's schema.
            s = fs[0].getSchema();
            if (s == null) {
                throw new IOException("Schemas must be fully specified to use "
                        + "this storage function.  No schema found for field " + field.getName());
            }
            fs = s.getFields();

            List<Map<String, Object>> a = new ArrayList<Map<String, Object>>();
            for (Tuple t : (DataBag) d) {
                Map<String, Object> ma = new LinkedHashMap<String, Object>();
                for (int j = 0; j < fs.length; j++) {
                    ma.put(fs[j].getName(), t.get(j));
                }
                a.add(ma);
            }

            builder.add(field.getName(), a);
            return;
        case DataType.MAP:
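            // Note: the map's entries are added directly to the enclosing document
            // rather than being nested under this field's name.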
            Map map = (Map) d;
            for (Object key : map.keySet()) {
                builder.add(key.toString(), map.get(key));
            }
            return;
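        // Any Pig type without a matching case above is silently skipped.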
        }
    }

    public void prepareToWrite(RecordWriter writer) throws IOException {
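        // Runs on the backend before any records are written: restore the schema
        // from the UDFContext and ensure any indexes requested in the options.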

        _recordWriter = (MongoRecordWriter) writer;
        log.info("Preparing to write to " + _recordWriter);
        if (_recordWriter == null)
            throw new IOException("Invalid Record Writer");
        // Parse the schema from the string stored in the properties object.

        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(this.getClass(), new String[] { _udfContextSignature });

        String strSchema = p.getProperty(PIG_OUTPUT_SCHEMA_UDF_CONTEXT);
        if (strSchema == null) {
            throw new IOException("Could not find schema in UDF context");
        }

        try {
            // Parse the schema from the string stored in the properties object.
            this.schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
        } catch (Exception e) {
            throw new IOException("Unable to parse the output schema: " + strSchema, e);
        }

        if (options != null) {
            // If the options requested any indexes, ensure them now:
            for (MongoStorageOptions.Index in : options.getIndexes()) {
                _recordWriter.ensureIndex(in.index, in.options);
            }
        }
    }

    public OutputFormat getOutputFormat() throws IOException {
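        // Pass the update keys and the multi-update flag through to the output
        // format when constructor options were supplied.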
        final MongoOutputFormat outputFmt = options == null ? new MongoOutputFormat()
                : new MongoOutputFormat(options.getUpdate().keys, options.getUpdate().multi);
        return outputFmt;
    }

    public String relToAbsPathForStoreLocation(String location, org.apache.hadoop.fs.Path curDir)
            throws IOException {
        // Don't convert anything - override to keep base from messing with URI
        return location;
    }

    public void setStoreLocation(String location, Job job) throws IOException {
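        // The store location is the mongodb:// output URI; the schema captured in
        // checkSchema() is also copied from the UDFContext into the job configuration.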
        final Configuration config = job.getConfiguration();
        log.info("Store Location Config: " + config + " For URI: " + location);
        if (!location.startsWith("mongodb://"))
            throw new IllegalArgumentException(
                    "Invalid URI Format.  URIs must begin with a mongodb:// protocol string.");
        MongoConfigUtil.setOutputURI(config, location);
        final Properties properties = UDFContext.getUDFContext().getUDFProperties(this.getClass(),
                new String[] { _udfContextSignature });
        config.set(PIG_OUTPUT_SCHEMA, properties.getProperty(PIG_OUTPUT_SCHEMA_UDF_CONTEXT));
    }

    public void setStoreFuncUDFContextSignature(String signature) {
        _udfContextSignature = signature;
    }

    String _udfContextSignature = null;
    MongoRecordWriter _recordWriter = null;
}
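
Example usage

The Javadoc on the constructor above shows the argument syntax; the sketch below places it in a complete Pig script. The jar names, input path, field names, and the database/collection in the URI are placeholders, not part of the source above.

-- Register the MongoDB Java driver and the mongo-hadoop jars (names and paths depend on your build).
REGISTER mongo-java-driver.jar;
REGISTER mongo-hadoop-core.jar;
REGISTER mongo-hadoop-pig.jar;

-- Load data with a fully specified schema (the storage function requires one for tuple and bag fields).
logs = LOAD '/data/logs' USING PigStorage('\t')
       AS (time:chararray, servername:chararray, hostname:chararray);

-- Store into MongoDB, updating on (time, servername, hostname) and ensuring a unique index,
-- mirroring the example in the constructor's Javadoc.
STORE logs INTO 'mongodb://localhost:27017/demo.logs'
      USING com.mongodb.hadoop.pig.MongoStorage(
          'update [time, servername, hostname]',
          '{time : 1, servername : 1, hostname : 1}, {unique:true, dropDups: true}');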