com.twitter.pycascading.MetaScheme.java Source code

Introduction

Here is the source code for com.twitter.pycascading.MetaScheme.java.

Source

/**
 * Copyright 2011 Twitter, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.twitter.pycascading;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;

import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;

/**
 * A Cascading Scheme that stores header information for an output dataset. It
 * records all formatting information so that later on the tuple field names and
 * types can be reloaded without having to specify them explicitly.
 * 
 * It also stores the original scheme object so that at load time we don't have
 * to worry about that either.
 * 
 * @author Gabor Szabo
 */
public class MetaScheme extends Scheme {
    private static final long serialVersionUID = 8194175541999063797L;

    private static final String schemeFileName = ".pycascading_scheme";
    private static final String headerFileName = ".pycascading_header";
    private static final String typeFileName = ".pycascading_types";

    private Scheme scheme;
    private String outputPath;
    private boolean firstLine = true;
    private boolean typeFileToWrite = true;

    /**
     * Call this to get the original Cascading scheme that the data was written
     * in.
     * 
     * @param inputPath
     *          The path to where the scheme information was stored (normally the
     *          same as the path to the data)
     * @return The Cascading scheme that was used when the data was written.
     * @throws IOException
     */
    public static Scheme getSourceScheme(String inputPath) throws IOException {
        Path path = new Path(inputPath + "/" + schemeFileName);
        FileSystem fs = path.getFileSystem(new Configuration());
        try {
            FSDataInputStream file = fs.open(path);
            ObjectInputStream ois = new ObjectInputStream(file);
            Scheme scheme = (Scheme) ois.readObject();
            Fields fields = (Fields) ois.readObject();
            scheme.setSourceFields(fields);
            ois.close();
            file.close();
            return scheme;
        } catch (ClassNotFoundException e) {
            throw new IOException("Could not read PyCascading file header: " + inputPath + "/" + schemeFileName);
        }
    }
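
    // A minimal usage sketch (illustrative, not part of this class): data
    // previously written through a MetaScheme sink can be re-sourced without
    // declaring its fields again. Hfs below is cascading.tap.hadoop.Hfs,
    // assuming Cascading 2.x:
    //
    //   Scheme stored = MetaScheme.getSourceScheme("output/data");
    //   Tap source = new Hfs(stored, "output/data");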

    /**
     * Placeholder overload; the signature and generic types must be kept
     * consistent with the Cascading Scheme API.
     */
    public void sourceConfInit(FlowProcess<?> flowProcess, Tap<?, ?, ?> tap, JobConf conf) {
        // Should never be called: MetaScheme instances are only used as sinks.
    }

    /**
     * Returns a scheme that wraps the given scheme for sinking and stores the
     * field names and the serialized scheme itself in outputPath. In addition,
     * a file called .pycascading_header is generated, which stores the names
     * of the fields in a TAB-delimited format.
     * 
     * @param scheme
     *          The Cascading scheme to be used to store the data
     * @param outputPath
     *          Path where the metainformation about the scheme and field names
     *          should be stored
     * @return A scheme that can be used to sink the data into
     * @throws IOException
     */
    public static Scheme getSinkScheme(Scheme scheme, String outputPath) throws IOException {
        return new MetaScheme(scheme, outputPath);
    }
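
    // Sink-side counterpart sketch (illustrative; assumes Cascading 2.x and
    // its cascading.scheme.hadoop.TextLine):
    //
    //   Scheme wrapped = MetaScheme.getSinkScheme(new TextLine(), "output/data");
    //   Tap sink = new Hfs(wrapped, "output/data");
    //
    // The wrapped TextLine writes the tuples themselves; the metadata files
    // are written once by sink() below.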

    protected MetaScheme(Scheme scheme, String outputPath) throws IOException {
        this.scheme = scheme;
        this.outputPath = outputPath;
    }

    public void sourceConfInit(FlowProcess flowProcess, Tap tap, Object conf) {
        scheme.sourceConfInit(flowProcess, tap, conf);
    }

    public void sinkConfInit(FlowProcess flowProcess, Tap tap, Object conf) {
        scheme.sinkConfInit(flowProcess, tap, conf);
    }

    public boolean source(FlowProcess flowProcess, SourceCall sourceCall) throws IOException {
        // MetaScheme instances are only used for sinking; reading goes through
        // the original scheme restored by getSourceScheme(). Report no tuples.
        return false;
    }

    public void sink(FlowProcess flowProcess, SinkCall sinkCall) throws IOException {
        if (firstLine) {
            // Write the TAB-delimited header with the field names. Only the
            // first mapper/reducer to create the file writes it; the others
            // fail createNewFile() and skip.
            Path path = new Path(outputPath + "/" + headerFileName);
            FileSystem fs = path.getFileSystem(((HadoopFlowProcess) flowProcess).getJobConf());
            FSDataOutputStream fsdos = null;
            try {
                if (fs.createNewFile(path)) {
                    fsdos = fs.create(path, true);
                    boolean firstField = true;
                    for (Comparable<?> field : sinkCall.getOutgoingEntry().getFields()) {
                        if (firstField)
                            firstField = false;
                        else
                            fsdos.writeBytes("\t");
                        fsdos.writeBytes(field.toString());
                    }
                    fsdos.writeBytes("\n");
                }
            } catch (IOException ignored) {
                // Another task won the race; the header file already exists.
            } finally {
                if (null != fsdos) {
                    fsdos.close();
                }
            }
            // Serialize the original scheme and the field names so that
            // getSourceScheme() can restore them at read time.
            path = new Path(outputPath + "/" + schemeFileName);
            ObjectOutputStream oos = null;
            fsdos = null;
            try {
                if (fs.createNewFile(path)) {
                    fsdos = fs.create(path, true);
                    oos = new ObjectOutputStream(fsdos);
                    oos.writeObject(scheme);
                    oos.writeObject(sinkCall.getOutgoingEntry().getFields());
                }
            } catch (IOException ignored) {
                // Another task won the race; the scheme file already exists.
            } finally {
                // Close the ObjectOutputStream first: it flushes its buffer
                // into the underlying stream and closes it as well.
                if (null != oos) {
                    oos.close();
                } else if (null != fsdos) {
                    fsdos.close();
                }
            }
        }
        firstLine = false;

        if (typeFileToWrite) {
            // Record the Java class of each field of the first tuple, one
            // "name<TAB>classname" line per field. The same create-once race
            // trick as above applies.
            Path path = new Path(outputPath + "/" + typeFileName);
            FileSystem fs = path.getFileSystem(((HadoopFlowProcess) flowProcess).getJobConf());
            FSDataOutputStream fsdos = null;
            try {
                if (fs.createNewFile(path)) {
                    fsdos = fs.create(path, true);
                    TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
                    for (int i = 0; i < tupleEntry.size(); i++) {
                        String fieldName;
                        if (tupleEntry.getFields().size() < tupleEntry.size()) {
                            // We don't have names for the fields
                            fieldName = "";
                        } else {
                            fieldName = tupleEntry.getFields().get(i) + "\t";
                        }
                        Object object = tupleEntry.getObject(i);
                        Class<?> objectClass = (object == null ? Object.class : object.getClass());
                        fsdos.writeBytes(fieldName + objectClass.getName() + "\n");
                    }
                }
            } catch (IOException ignored) {
                // Another task won the race; the types file already exists.
            } finally {
                if (null != fsdos) {
                    fsdos.close();
                }
            }
            typeFileToWrite = false;
        }
        scheme.sink(flowProcess, sinkCall);
    }
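
    // Shape of the metadata files written above, for a sink with fields
    // ("user", "count") holding a String and a Long (illustrative values):
    //
    //   .pycascading_header: user<TAB>count
    //   .pycascading_types:  user<TAB>java.lang.String
    //                        count<TAB>java.lang.Long
    //   .pycascading_scheme: the Java-serialized Scheme and Fields objects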

    /*
      // Legacy Cascading 1.x sink(), kept for reference.
      @Override
      public void sink(TupleEntry tupleEntry, OutputCollector outputCollector) throws IOException {
        // TODO: do it so that we don't need to specify /user/gabor if the path
        // doesn't start with /
        if (firstLine) {
          Path path = new Path(outputPath + "/" + headerFileName);
          FileSystem fs = path.getFileSystem(new Configuration(true));
          try {
            // We're trying to create the file by just one of the mappers/reducers,
            // the one that can do it first
            if (fs.createNewFile(path)) {
              FSDataOutputStream stream = fs.create(path, true);
              boolean firstField = true;
              for (Comparable<?> field : tupleEntry.getFields()) {
                if (firstField)
                  firstField = false;
                else
                  stream.writeBytes("\t");
                stream.writeBytes(field.toString());
              }
              stream.writeBytes("\n");
              stream.close();
            }
          } catch (IOException e) {
          }

          path = new Path(outputPath + "/" + schemeFileName);
          fs = path.getFileSystem(new Configuration(true));
          try {
            if (fs.createNewFile(path)) {
              FSDataOutputStream stream = fs.create(path, true);
              ObjectOutputStream ostream = new ObjectOutputStream(stream);
              ostream.writeObject(scheme);
              ostream.writeObject(tupleEntry.getFields());
              ostream.close();
              stream.close();
            }
          } catch (IOException e) {
          }

          firstLine = false;
        }

        if (typeFileToWrite) {
          Path path = new Path(outputPath + "/" + typeFileName);
          FileSystem fs = path.getFileSystem(new Configuration());
          try {
            if (fs.createNewFile(path)) {
              FSDataOutputStream stream = fs.create(path, true);
              for (int i = 0; i < tupleEntry.size(); i++) {
                Comparable fieldName = null;
                if (tupleEntry.getFields().size() < tupleEntry.size()) {
                  // We don't have names for the fields
                  fieldName = "";
                } else {
                  fieldName = tupleEntry.getFields().get(i) + "\t";
                }
                Object object = tupleEntry.getObject(i);
                Class<?> objectClass = (object == null ? Object.class : object.getClass());
                stream.writeBytes(fieldName + objectClass.getName() + "\n");
              }
              stream.close();
            }
          } catch (IOException e) {
          }
          typeFileToWrite = false;
        }
        scheme.sink(tupleEntry, outputCollector);
      }
    */
}
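
Example

A minimal end-to-end sketch of how this class might be used from Java. The paths, the TextDelimited scheme, and the flow wiring are illustrative assumptions (Cascading 2.x on Hadoop), not taken from the source above:

import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.Pipe;
import cascading.scheme.Scheme;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;

import com.twitter.pycascading.MetaScheme;

public class MetaSchemeExample {
    public static void main(String[] args) throws Exception {
        // Copy a TAB-delimited input to an output that also records its
        // field names and types next to the data.
        Scheme text = new TextDelimited(new Fields("user", "count"), "\t");
        Tap in = new Hfs(text, "input/counts");
        Tap out = new Hfs(MetaScheme.getSinkScheme(text, "output/counts"), "output/counts");
        new HadoopFlowConnector().connect(in, out, new Pipe("copy")).complete();

        // Later, read the data back without re-declaring the fields.
        Scheme stored = MetaScheme.getSourceScheme("output/counts");
        Tap source = new Hfs(stored, "output/counts");
    }
}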