com.indeed.imhotep.builder.tsv.EasyIndexBuilderFromTSV.java Source code

Introduction

Here is the source code for com.indeed.imhotep.builder.tsv.EasyIndexBuilderFromTSV.java, a builder that creates an Imhotep index from a TSV or CSV input file.

Source

/*
 * Copyright (C) 2014 Indeed Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.indeed.imhotep.builder.tsv;

import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import com.google.common.primitives.Longs;
import com.indeed.util.core.io.Closeables2;
import com.indeed.imhotep.builder.tsv.input.CSVInputReader;
import com.indeed.imhotep.builder.tsv.input.InputReader;
import com.indeed.imhotep.builder.tsv.input.TSVInputReaderNoEscaping;
import com.indeed.imhotep.index.builder.util.EasyIndexBuilder;
import com.indeed.imhotep.index.builder.util.IndexWriter;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import org.joda.time.DateTimeZone;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.TimeZone;
import java.util.zip.GZIPInputStream;

/**
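 * Builds an Imhotep index from a TSV or CSV input file. Indexing options are
 * inferred from markers on the header row's field names, and int fields are
 * detected by scanning the data rows.
 *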
 * @author vladimir
 */

@SuppressWarnings("UnusedDeclaration")
public class EasyIndexBuilderFromTSV extends EasyIndexBuilder {
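    // Pin both the JVM and Joda-Time default time zones to a fixed UTC-6
    // offset so timestamp handling does not depend on the machine's default
    // time zone.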
    static {
        TimeZone.setDefault(TimeZone.getTimeZone("GMT-6"));
        DateTimeZone.setDefault(DateTimeZone.forOffsetHours(-6));
    }
    private static final Logger log = Logger.getLogger(EasyIndexBuilderFromTSV.class);
    protected int timeFieldIndex = -1;
    protected long startTimestampMS;
    protected long endTimestampMS;
    protected IndexField[] indexFields;
    protected boolean PRODUCE_FLAMDEX = true;
    int rowCount = 0;

    @Override
    protected void setup() {
        super.setup();
        //        if(getInputFilePath().toString().startsWith("hdfs:")) {
        //            // need to login to kerberos
        //            try {
        //                KerberosUtils.loginFromKeytab(null, null);
        //            } catch (IOException e) {
        //                log.error("Failed to log in to Kerberos", e);
        //            }
        //        }
    }

    protected InputReader getInputReader() {
        final Path inputFile = getInputFilePath();
        final BufferedReader fileReader = getInputFileReader(inputFile);
        final String fileName = inputFile.getName();
        if (fileName.endsWith(".csv") || fileName.endsWith(".csv.gz")) {
            return new CSVInputReader(fileReader);
        } else {
            // Should we also use the escaped TSV reader in some situations? What would be the trigger?
            return new TSVInputReaderNoEscaping(fileReader);
        }
    }

    @Override
    protected void init() {
        super.init();

        if (PRODUCE_FLAMDEX) {
            final IndexWriter indexWriter = new FlamdexIndexWriter(indexDir);
            setIndexWriter(indexWriter);
        }

        startTimestampMS = options.start;
        endTimestampMS = options.end;

        final InputReader reader = getInputReader();
        try {
            final Iterator<String[]> iterator = reader.iterator();
            inferFieldsFromHeader(iterator);
            if (PRODUCE_FLAMDEX) {
                detectIntFields(iterator);
            }
        } catch (IOException e) {
            throw Throwables.propagate(e);
        } finally {
            Closeables2.closeQuietly(reader, log);
        }
    }

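    /**
     * Scans every data row, counting the values that parse as longs and the
     * values that are blank for each column. After 10,000 rows, columns that
     * can no longer satisfy the isIntField heuristic below are pruned from
     * further checks.
     */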
    private void detectIntFields(Iterator<String[]> iterator) throws IOException {
        final int[] intValCount = new int[indexFields.length];
        final int[] blankValCount = new int[indexFields.length];
        final boolean[] isInt = new boolean[indexFields.length];
        Arrays.fill(isInt, true);
        if (!iterator.hasNext()) {
            throw new RuntimeException(
                    "No data is available in the input file. At least one line of data past the header is required");
        }
        log.info("Scanning the file to detect int fields");
        while (iterator.hasNext()) {
            final String[] values = iterator.next();
            final int valueCount = Math.min(values.length, indexFields.length);
            rowCount++;
            for (int i = 0; i < valueCount; i++) {
                if (!isInt[i]) {
                    continue; // we already know this is not an integer
                }

                if (Longs.tryParse(values[i]) != null) {
                    intValCount[i]++;
                } else if (values[i].isEmpty()) {
                    blankValCount[i]++;
                }

                if (rowCount > 10000 && !isIntField(intValCount[i], blankValCount[i], rowCount)) {
                    isInt[i] = false;
                }
            }
        }
        final List<String> intFields = Lists.newArrayList();
        for (int i = 0; i < indexFields.length; i++) {
            boolean isIntField = isInt[i] && isIntField(intValCount[i], blankValCount[i], rowCount);
            if (isIntField) {
                intFields.add(indexFields[i].getName());
            }
            indexFields[i].setIntField(isIntField);
        }
        Collections.sort(intFields);
        log.info("Int fields detected: " + Joiner.on(",").join(intFields));
    }

    /**
     * Tries to guess whether the field should be considered an int field rather
     * than a string field, based on the counts of int values, blanks, and other
     * strings in it. We consider it an int field if at least 1 in 20 rows (5%)
     * are ints, and over 90% of rows are ints or blanks, i.e. under 10% are
     * other strings. For example, with rowCount = 1000 we require at least 50
     * int values and intValCount + blankValCount > 900.
     */
    private static boolean isIntField(int intValCount, int blankValCount, int rowCount) {
        if (intValCount < Math.ceil(rowCount / 20.0)) {
            // under 5% ints, so consider it a string field
            return false;
        }
        // we have a good number of int values; consider blanks to be 0s and
        // require over 90% of rows to fit for the field to be considered int
        return (intValCount + blankValCount) > rowCount * 0.9;
    }

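    /**
     * Reads the header row and creates one IndexField per column. A trailing
     * marker on a column name selects extra indexing options:
     *   name    - index the full value only
     *   name*   - index the full value and its tokens
     *   name**  - index the full value, its tokens, and token bigrams
     *   name+   - index the tokens only, not the full value
     * A column named "time" or "unixtime" is used as the document timestamp.
     */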
    private void inferFieldsFromHeader(Iterator<String[]> iterator) throws IOException {
        if (!iterator.hasNext()) {
            throw new RuntimeException("The provided file didn't have a header with field names in the first line");
        }

        final String[] fieldHeaders = iterator.next();
        indexFields = new IndexField[fieldHeaders.length];
        for (int i = 0; i < fieldHeaders.length; i++) {
            String field = fieldHeaders[i];
            boolean tokenized = false;
            boolean bigram = false;
            boolean idxFullField = true;
            if (field.endsWith("**")) {
                bigram = true;
                tokenized = true;
                idxFullField = true;
                field = field.substring(0, field.length() - 2);
            } else if (field.endsWith("*")) {
                bigram = false;
                tokenized = true;
                idxFullField = true;
                field = field.substring(0, field.length() - 1);
            } else if (field.endsWith("+")) {
                bigram = false;
                tokenized = true;
                field = field.substring(0, field.length() - 1);
                idxFullField = false;
            }

            validateFieldName(field);

            final IndexField indexField = new IndexField(field, tokenized, bigram, idxFullField);
            indexFields[i] = indexField;
            if ("time".equals(field) || "unixtime".equals(field)) {
                timeFieldIndex = i;
            }
        }
    }

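    /**
     * Opens the input file through the Hadoop FileSystem API, decompressing on
     * the fly if the file name ends in ".gz".
     */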
    private BufferedReader getInputFileReader(Path inputFile) {
        try {
            final FileSystem hdfs = getHDFS(inputFile);
            final Path qualifiedInputFile = inputFile.makeQualified(hdfs);
            if (!hdfs.exists(inputFile)) {
                throw new RuntimeException("The provided input file doesn't exist " + qualifiedInputFile
                        + "\nFor hdfs files use 'hdfs:' prefix like hdfs:/tmp/file.tsv");
            }
            log.info("Reading TSV data from " + qualifiedInputFile);
            InputStream inputStream = hdfs.open(inputFile);
            if (inputFile.getName().endsWith(".gz")) {
                inputStream = new GZIPInputStream(inputStream);
            }
            return new BufferedReader(new InputStreamReader(inputStream, Charsets.UTF_8));
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }

    private void validateFieldName(String field) {
        final String validationRegex = "[A-Za-z_][A-Za-z_0-9]*";
        if (!field.matches(validationRegex)) {
            throw new RuntimeException("Please make field name " + field + " conform to regex: " + validationRegex);
        }
    }

    @Override
    protected void loop() {
        final InputReader reader = getInputReader();
        try {
            Iterator<String[]> iterator = reader.iterator();
            iterator.next(); // skip header
            while (iterator.hasNext()) {
                final String[] values = iterator.next();
                final int valueCount = Math.min(values.length, indexFields.length);
                if (valueCount != indexFields.length) {
                    // the row has fewer columns than the header; any extra
                    // columns would have been silently dropped by the min() above
                    // TODO: error? log?
                }
                long docTimestamp = startTimestampMS; // default in case we don't have a time column
                for (int i = 0; i < valueCount; i++) {
                    final String value = values[i];
                    final IndexField field = indexFields[i];
                    if (i == timeFieldIndex) {
                        long timestamp;
                        try {
                            timestamp = Long.parseLong(value);
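                            // Integer.MAX_VALUE in milliseconds is only ~25 days
                            // past the epoch, while as seconds it reaches the year
                            // 2038, so smaller values are assumed to be epoch seconds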
                            if (timestamp < Integer.MAX_VALUE) {
                                timestamp *= 1000; // assume it's in seconds and convert to milliseconds
                            }
                        } catch (NumberFormatException e) {
                            // TODO
                            //                            log.warn("Illegal timestamp: " + value);
                            continue;
                        }
                        if (timestamp < startTimestampMS || timestamp > endTimestampMS) { // should this be inclusive of endTS?
                            log.warn("Timestamp outside range: " + timestamp + ". Should be between: "
                                    + startTimestampMS + " and " + endTimestampMS);
                            continue;
                        }
                        docTimestamp = timestamp;
                    } else {
                        if (field.isIntField()) {
                            final Long intValue = Longs.tryParse(value);
                            if (intValue != null) {
                                addTerm(field.getName(), intValue);
                            } else {
                                // don't index non-int values at all
                                if (!value.isEmpty()) {
                                    field.incrementIllegalIntValue();
                                }
                            }
                        } else { // string term
                            if (field.isIdxFullField()) {
                                addTerm(field.getName(), value, false);
                            }
                            if (field.isTokenized()) {
                                // if the full value is indexed under the plain name,
                                // write the tokens under the tokenized field name;
                                // otherwise the tokens can use the plain name directly
                                final String fn = field.isIdxFullField() ? field.getNameTokenized() : field.getName();
                                addTerm(fn, value, true);
                            }
                            if (field.isBigram()) {
                                addBigramTerm(field.getNameBigram(), value);
                            }
                        }
                    }
                }

                saveDocument(docTimestamp);
            }
            for (IndexField field : indexFields) {
                int badIntVals = field.getIllegalIntValues();
                if (badIntVals > 0) {
                    log.warn("Column " + field.getName() + " had " + badIntVals + " (" + badIntVals * 100 / rowCount
                            + "%) illegal int values");
                }
            }
        } finally {
            Closeables2.closeQuietly(reader, log);
        }
    }

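    /**
     * Returns the Hadoop FileSystem matching the path's scheme (hdfs:, s3n:,
     * or file:), using the default Hadoop configuration.
     */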
    public FileSystem getHDFS(Path inputFilePath) {
        try {
            return inputFilePath.getFileSystem(new org.apache.hadoop.conf.Configuration());
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }

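    /**
     * Resolves the extra command-line argument as the input file path; paths
     * without a scheme are treated as local files.
     */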
    protected Path getInputFilePath() {
        String inputFilePath = options.extra;
        if (Strings.isNullOrEmpty(inputFilePath)) {
            throw new RuntimeException("Expecting extra arg to be the HDFS path of the TSV input file.");
        }
        if (!inputFilePath.startsWith("hdfs:") && !inputFilePath.startsWith("s3n:")
                && !inputFilePath.startsWith("file:")) {
            inputFilePath = "file:" + inputFilePath;
        }
        return new Path(inputFilePath);
    }
}
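
Example

Below is a minimal sketch of a tab-separated input file this builder accepts. The data is hypothetical; only the header markers and the "unixtime" column name follow conventions the code above actually parses.

unixtime	country	query**	title+	clicks
1388534400	US	java developer	Senior Java Developer	3
1388538000	DE	data engineer	Data Engineer	

Here "unixtime" supplies the document timestamp (epoch seconds, converted to milliseconds), "country" is indexed as a plain string field, "query**" is indexed as the full value plus its tokens and token bigrams, "title+" is indexed as tokens only, and "clicks" would be detected as an int field by detectIntFields, since every non-blank value parses as a long. Each base field name must match [A-Za-z_][A-Za-z_0-9]*.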