Source code for com.cloudera.recordbreaker.learnstructure.LearnStructure.java

Java tutorial

Introduction

Here is the source code for com.cloudera.recordbreaker.learnstructure.LearnStructure.java

Source

/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.learnstructure;

import java.io.*;
import java.util.*;
import org.apache.avro.Schema;
import org.apache.avro.io.JsonEncoder;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.generic.GenericDatumWriter;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.conf.Configuration;

/*********************************************************
 * LearnStructure is the main class for inferring pattern-extractors and schemas from a text file.
 *
 * This code operates on a raw text file and emits the extractors/schemas.  The user
 * may decide to remove certain extractors/schemas if they apply to only a tiny fraction of
 * the lines in the target text file.
 *
 *********************************************************/
public class LearnStructure {
    /** Filename for the Avro schema, rendered as JSON, in the output directory. */
    public static final String SCHEMA_FILENAME = "schema.json";
    /** Filename for parsed records rendered as Avro-JSON. */
    public static final String JSONDATA_FILENAME = "data.avro.json";
    /** Filename for parsed records in Avro binary container format. */
    public static final String DATA_FILENAME = "data.avro";
    /** Filename for the serialized parser (the inferred type tree). */
    public static final String PARSER_FILENAME = "parser.dat";

    public LearnStructure() {
    }

    /**
     * Infers a record structure for the text file at {@code p} and emits the
     * resulting schema, serialized parser, and (optionally) parsed data.
     *
     * @param fs            filesystem holding the input file
     * @param p             input text file, processed one line at a time
     * @param fs2           filesystem where all output files are created
     * @param schemaFile    destination for the Avro schema as JSON, or null to skip
     * @param parseTreeFile destination for the serialized parser, or null to skip
     * @param jsonDataFile  destination for parsed records as Avro-JSON, or null to skip
     * @param avroDataFile  destination for parsed records as an Avro data file, or null to skip
     * @param verbose       if true, print timing and parse statistics to stderr
     * @param maxLines      maximum number of input lines examined during inference;
     *                      a negative value means no limit
     * @throws IOException on any read or write failure
     */
    public void inferRecordFormat(FileSystem fs, Path p, FileSystem fs2, Path schemaFile, Path parseTreeFile,
            Path jsonDataFile, Path avroDataFile, boolean verbose, int maxLines) throws IOException {
        // Parse bookkeeping.  The "unparseable" lists are collected for potential
        // diagnostics; nothing in this method reads them back afterwards.
        List<Integer> unparseableLineNos = new ArrayList<Integer>();
        List<String> unparseableStrs = new ArrayList<String>();
        List<Integer> parseableLineNos = new ArrayList<Integer>();
        List<List<Token.AbstractToken>> allChunks = new ArrayList<List<Token.AbstractToken>>();

        //
        // Transform the text into a list of "chunks".  A single chunk corresponds to
        // a line of text.  A chunk is a list of Tokens.
        //
        long startRead = System.currentTimeMillis();
        // Pin the charset so results do not depend on the platform default encoding.
        BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p), "UTF-8"));
        try {
            String s = in.readLine();
            int lineno = 0;
            while (s != null) {
                if (maxLines >= 0 && lineno >= maxLines) {
                    break;
                }
                List<Token.AbstractToken> chunkToks = Tokenizer.tokenize(s);
                if (chunkToks != null) {
                    allChunks.add(chunkToks);
                    parseableLineNos.add(lineno);
                } else {
                    unparseableStrs.add(s);
                    unparseableLineNos.add(lineno);
                }
                s = in.readLine();
                lineno++;
            }
        } finally {
            in.close();
        }

        //
        // Infer type structure from the tokenized chunks
        //
        long start = System.currentTimeMillis();
        InferredType typeTree = TypeInference.infer(allChunks);
        long end = System.currentTimeMillis();
        double loadTime = (start - startRead) / 1000.0;
        double inferTime = (end - start) / 1000.0;
        double totalTime = (end - startRead) / 1000.0;
        if (verbose) {
            System.err.println("Number of chunks: " + allChunks.size());
            System.err.println("Elapsed load time: " + loadTime);
            System.err.println("Elapsed inference time: " + inferTime);
            System.err.println("Total execution time: " + totalTime);
        }

        //
        // The existing type tree is now correct, but could probably be more succinct.
        // We can now improve/rewrite it.
        //

        //
        // Should every top-level type be ARRAY, so as to allow repeated log lines?
        // Or does the Avro format allow an implicit top-level repeating structure?
        //

        //
        // Dump the results.  We emit:
        // 1) A JSON/Avro schema
        // 2) A serialized parser program that can consume data and emit Avro files using the given schema
        //
        Schema s = typeTree.getAvroSchema();
        if (schemaFile != null) {
            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs2.create(schemaFile), "UTF-8"));
            try {
                out.write(s.toString(true));
            } finally {
                out.close();
            }
        }
        if (parseTreeFile != null) {
            DataOutputStream outd = new DataOutputStream(new BufferedOutputStream(fs2.create(parseTreeFile)));
            try {
                typeTree.write(outd);
            } finally {
                outd.close();
            }
        }

        //
        // Apply the typetree's parser.
        //
        if (jsonDataFile != null) {
            Schema schema = typeTree.getAvroSchema();
            GenericDatumWriter<GenericContainer> jsonGDWriter =
                    new GenericDatumWriter<GenericContainer>(schema);
            BufferedOutputStream outJson = new BufferedOutputStream(fs2.create(jsonDataFile));
            try {
                JsonEncoder encoder = EncoderFactory.get().jsonEncoder(schema, outJson);
                BufferedReader in2 = new BufferedReader(new InputStreamReader(fs.open(p), "UTF-8"));
                try {
                    String str = in2.readLine();
                    while (str != null) {
                        GenericContainer gct = typeTree.parse(str);
                        if (gct != null) {
                            jsonGDWriter.write(gct, encoder);
                        }
                        str = in2.readLine();
                    }
                } finally {
                    in2.close();
                }
                // Flush inside the try block so a flush failure is reported instead of
                // masking an earlier exception thrown from the write loop.
                encoder.flush();
            } finally {
                outJson.close();
            }
        }

        if (avroDataFile != null) {
            int numGoodParses = 0;
            int lineno = 0;
            Schema schema = typeTree.getAvroSchema();

            GenericDatumWriter<GenericContainer> gdWriter =
                    new GenericDatumWriter<GenericContainer>(schema);
            DataFileWriter<GenericContainer> outData =
                    new DataFileWriter<GenericContainer>(gdWriter).create(schema, fs2.create(avroDataFile));

            try {
                BufferedReader in3 = new BufferedReader(new InputStreamReader(fs.open(p), "UTF-8"));
                try {
                    String str = in3.readLine();
                    while (str != null) {
                        GenericContainer gct = typeTree.parse(str);
                        if (gct != null) {
                            numGoodParses++;
                            outData.append(gct);
                        } else {
                            if (verbose) {
                                System.err.println("unparsed line: '" + str + "'");
                            }
                        }
                        str = in3.readLine();
                        lineno++;
                    }
                } finally {
                    in3.close();
                }
            } finally {
                outData.close();
            }
            if (verbose) {
                System.err.println();
                System.err.println("Total # input lines: " + lineno);
                System.err.println("Total # lines parsed correctly: " + numGoodParses);
            }
        }
    }

    //////////////////////////////////////////
    // main()
    //////////////////////////////////////////
    /**
     * Command-line entry point.  Usage:
     * {@code LearnStructure <input-datafile> <outdir> (-emitAvro (true)|false)}
     */
    public static void main(String argv[]) throws IOException {
        if (argv.length < 2) {
            System.err.println("Usage: LearnStructure <input-datafile> <outdir> (-emitAvro (true)|false)");
            return;
        }
        FileSystem localFS = FileSystem.getLocal(new Configuration());
        boolean emitAvro = true;
        int i = 0;
        Path f = new Path(new File(argv[i++]).getCanonicalPath());
        File outdir = new File(argv[i++]).getCanonicalFile();
        for (; i < argv.length; i++) {
            if ("-emitAvro".equals(argv[i])) {
                i++;
                // Guard against "-emitAvro" given as the last argument; the original
                // code indexed past the end of argv here.
                if (i >= argv.length) {
                    System.err.println("Missing value for -emitAvro; expected true or false");
                    return;
                }
                emitAvro = "true".equals(argv[i]);
            }
        }

        System.err.println("Input file: " + f.toString());
        System.err.println("Output directory: " + outdir.getCanonicalPath());
        if (outdir.exists()) {
            throw new IOException("Output directory already exists: " + outdir);
        }
        // mkdirs() signals failure via its return value, not an exception.
        if (!outdir.mkdirs()) {
            throw new IOException("Could not create output directory: " + outdir);
        }
        Path schemaFile = new Path(outdir.getCanonicalPath(), SCHEMA_FILENAME);
        Path parseTreeFile = new Path(outdir.getCanonicalPath(), PARSER_FILENAME);
        Path jsonDataFile = null;
        Path avroDataFile = null;
        if (emitAvro) {
            jsonDataFile = new Path(outdir.getCanonicalPath(), JSONDATA_FILENAME);
            avroDataFile = new Path(outdir.getCanonicalPath(), DATA_FILENAME);
        }

        LearnStructure ls = new LearnStructure();
        ls.inferRecordFormat(localFS, f, localFS, schemaFile, parseTreeFile, jsonDataFile, avroDataFile, true, -1);
    }
}