com.cloudera.recordbreaker.analyzer.TextRegexpDataDescriptor.java Source code

Introduction

Here is the source code for com.cloudera.recordbreaker.analyzer.TextRegexpDataDescriptor.java, a data descriptor for text files whose lines match a set of known regular expressions. It can test whether a file fits the patterns and can convert the matching lines into an Avro data file.

Source

/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.conf.Configuration;

import org.apache.avro.Schema;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericData;

/***********************************************************************
 * <code>TextRegexpDataDescriptor</code> is a <code>DataDescriptor</code>
 * for text files whose lines follow one or more known regular expressions.
 * It can test whether a file fits its patterns and can translate the
 * matching lines into Avro records.
 *
 * @author "Michael Cafarella"
 * @version 1.0
 * @since 1.0
 * @see DataDescriptor
 ***********************************************************************/
public abstract class TextRegexpDataDescriptor extends GenericDataDescriptor {
    // Number of lines to sample when deciding whether a file matches the patterns.
    final static int MAX_LINES = 200;

    /**
     * Tests whether the file at <code>p</code> appears to be structured by
     * the given regular expressions: samples up to MAX_LINES lines and
     * reports a match if any single pattern hits on more than 30% of them.
     */
    public static boolean isTextRegexpFile(FileSystem fs, Path p, List<Pattern> regexps) throws IOException {
        int totalCounts = 0;
        int[] matchCounts = new int[regexps.size()];

        BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p)));
        try {
            String cur = null;
            while ((cur = in.readLine()) != null) {
                for (int i = 0; i < regexps.size(); i++) {
                    Pattern patt = regexps.get(i);
                    Matcher m = patt.matcher(cur);
                    if (m.find()) {
                        matchCounts[i]++;
                    }
                }
                totalCounts++;
                if (MAX_LINES >= 0 && totalCounts >= MAX_LINES) {
                    break;
                }
            }
        } finally {
            in.close();
        }

        // Declare a match if any single pattern hit on more than 30% of the sampled lines.
        // (For an empty file totalCounts is 0, the quotient is NaN, and the test is false.)
        for (int i = 0; i < matchCounts.length; i++) {
            if (((1.0 * matchCounts[i]) / totalCounts) > 0.3) {
                return true;
            }
        }
        return false;
    }

    List<Pattern> regexps;
    List<Schema> localschemas;

    /**
     * Creates a new <code>TextRegexpDataDescriptor</code> instance,
     * tailored to the structure described in the param list.
     *
     * @param p a <code>Path</code> to the file being described
     * @param fs the <code>FileSystem</code> where the file lives
     * @param fileType a <code>String</code> that identifies the file type
     * @param regexps a <code>List<Pattern></code> of per-line patterns
     * @param schemaList a <code>List<Schema></code>, one schema per pattern
     */
    public TextRegexpDataDescriptor(Path p, FileSystem fs, String fileType, List<Pattern> regexps,
            List<Schema> schemaList) throws IOException {
        super(p, fs, fileType);
        this.regexps = regexps;
        this.localschemas = schemaList;
        schemas.add(new TextRegexpSchemaDescriptor(this, fileType, regexps, localschemas));
    }

    /**
     * Rebuilds the schema descriptor for this file.  The stored schema
     * representation and blob can be ignored: the descriptor is reconstructed
     * from schemaId plus the regexps and schemas this object already holds.
     */
    SchemaDescriptor loadSchemaDescriptor(String schemaRepr, String schemaId, byte[] blob) throws IOException {
        return new TextRegexpSchemaDescriptor(this, schemaId, regexps, localschemas);
    }

    ///////////////////////////////////
    // GenericDataDescriptor
    ///////////////////////////////////
    /**
     * Writes the regexp-matching lines of this file to <code>dst</code> as an
     * Avro data file, keeping only rows that conform to the chosen output schema.
     */
    public void prepareAvroFile(FileSystem srcFs, FileSystem dstFs, Path dst, Configuration conf)
            throws IOException {
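        // Take the first union-free schema returned (by the method's name,
        // presumably the most frequently observed one); only rows conforming
        // to it end up in the output file.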
        SchemaDescriptor sd = this.getSchemaDescriptor().get(0);
        List<Schema> unionFreeSchemas = SchemaUtils.getUnionFreeSchemasByFrequency(sd, 100, true);
        Schema schema = unionFreeSchemas.get(0);

        // Open stream to write out Avro contents
        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
        dataFileWriter.create(schema, dstFs.create(dst, true));
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(srcFs.open(getFilename())));
            try {
                String rowStr = null;
                GenericData.Record rowRecord = null;
                while ((rowStr = in.readLine()) != null) {
                    for (int i = 0; i < regexps.size(); i++) {
                        Pattern curPattern = regexps.get(i);
                        Schema curSchema = localschemas.get(i);
                        Matcher curMatcher = curPattern.matcher(rowStr);

                        if (curMatcher.find()) {
                            // Create Avro record here
                            rowRecord = new GenericData.Record(curSchema);
                            List<Schema.Field> curFields = curSchema.getFields();

                            // Capture groups are 1-indexed (group 0 is the entire match);
                            // each group maps positionally onto a schema field.
                            for (int j = 0; j < curMatcher.groupCount(); j++) {
                                Schema.Field curField = curFields.get(j);

                                String fieldName = curField.name();
                                Schema fieldType = curField.schema();
                                String rawFieldValue = curMatcher.group(j + 1);

                                // Parse the raw text according to the declared field type; any
                                // other Avro type falls through and the field is left unset.
                                Object fieldValue = null;
                                if (fieldType.getType() == Schema.Type.INT) {
                                    fieldValue = Integer.parseInt(rawFieldValue);
                                } else if (fieldType.getType() == Schema.Type.FLOAT) {
                                    fieldValue = Float.parseFloat(rawFieldValue);
                                } else if (fieldType.getType() == Schema.Type.STRING) {
                                    fieldValue = rawFieldValue;
                                }
                                if (fieldValue != null) {
                                    rowRecord.put(fieldName, fieldValue);
                                }
                            }
                            // Append only records whose schema matches the one the output file
                            // was created with; structural equality avoids the collisions
                            // possible when comparing string hash codes.
                            if (rowRecord.getSchema().equals(schema)) {
                                dataFileWriter.append(rowRecord);
                            }
                        }
                    }
                }
            } finally {
                in.close();
            }
        } finally {
            dataFileWriter.close();
        }
    }
}
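
Since TextRegexpDataDescriptor is abstract, callers work with a concrete subclass that supplies the patterns and schemas. Below is a minimal sketch of what such a subclass might look like; the class name ExampleRegexpDataDescriptor, the "<count> <word>" line pattern, and the two-field schema are illustrative assumptions, not part of the RecordBreaker source (the four-argument Schema.Field constructor matches the older Avro releases this code targets).

package com.cloudera.recordbreaker.analyzer;

import java.io.IOException;
import java.util.Arrays;
import java.util.regex.Pattern;

import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical subclass: describes text files whose lines look like "<count> <word>".
public class ExampleRegexpDataDescriptor extends TextRegexpDataDescriptor {
    static final Pattern LINE_PATTERN = Pattern.compile("^(\\d+)\\s+(\\S+)$");
    static final Schema LINE_SCHEMA = Schema.createRecord("line", null, "example", false);
    static {
        LINE_SCHEMA.setFields(Arrays.asList(
            new Schema.Field("count", Schema.create(Schema.Type.INT), null, null),
            new Schema.Field("word", Schema.create(Schema.Type.STRING), null, null)));
    }

    public ExampleRegexpDataDescriptor(Path p, FileSystem fs) throws IOException {
        super(p, fs, "example-text", Arrays.asList(LINE_PATTERN), Arrays.asList(LINE_SCHEMA));
    }

    // A caller would typically test the file before constructing the descriptor:
    public static boolean accepts(FileSystem fs, Path p) throws IOException {
        return isTextRegexpFile(fs, p, Arrays.asList(LINE_PATTERN));
    }
}

One pattern with two capture groups pairs with one two-field schema; isTextRegexpFile then accepts a file only if that pattern matches more than 30% of its first 200 sampled lines.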