com.ricemap.spateDB.operations.Tail.java Source code

Java tutorial

Introduction

Here is the source code for com.ricemap.spateDB.operations.Tail.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
 * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */
package com.ricemap.spateDB.operations;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;

import com.ricemap.spateDB.core.ResultCollector;
import com.ricemap.spateDB.io.Text2;
import com.ricemap.spateDB.io.TextSerializable;
import com.ricemap.spateDB.util.CommandLineArguments;

/**
 * Reads the last n non-empty lines of a text file by scanning it backward in
 * fixed-size chunks.
 * @author tonyren, eldawy
 *
 */
public class Tail {

    /** Number of bytes fetched from the stream in each backward step. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Reads a maximum of n non-empty lines from the stream starting from its
     * current position and going backward. Both <code>'\n'</code> and
     * <code>'\r'</code> are treated as end-of-line bytes and empty lines are
     * skipped. Lines are reported from the last one toward the first one, and
     * the same <code>stockObject</code> instance is reused for every line.
     * The position of the stream is changed by this call.
     * 
     * @param in - An input stream. It'll be scanned from its current position
     *   backward till position 0
     * @param n - Maximum number of lines to return
     * @param stockObject - An object used to deserialize lines read. It can
     *   be set to <code>null</code> if output is also <code>null</code>. In this
     *   case, nothing is reported to the output.
     * @param output - An output collector used to report lines read.
     * @return - The position of the beginning of the earliest line read from
     *   buffer. If nothing can be read (<code>n</code> is not positive or the
     *   stream is at position 0), the starting position of the stream is
     *   returned unchanged.
     * @throws IOException if seeking or reading fails, including the case
     *   where a chunk cannot be read completely
     */
    public static <T extends TextSerializable> long tail(FSDataInputStream in, int n, T stockObject,
            ResultCollector<T> output) throws IOException {
        final long end = in.getPos();
        if (n <= 0 || end == 0) {
            // No line can be read; do not report a position past the start point
            return end;
        }

        int linesRead = 0;
        // Treat the starting position as a virtual EOL that closes the last line
        long offsetOfLastEol = end;
        long lastReadByte = end;

        Text line = new Text();
        // Leading bytes of a chunk that belong to a line starting in an earlier chunk
        Text remainder = new Text();
        byte[] buffer = new byte[BUFFER_SIZE];

        while (lastReadByte > 0 && linesRead < n) {
            // Read next chunk from the back, aligned to a multiple of the buffer size
            long chunkStart = (lastReadByte - 1) - (lastReadByte - 1) % buffer.length;
            int chunkLength = (int) (lastReadByte - chunkStart);
            in.seek(chunkStart);
            // readFully guarantees the whole chunk is in the buffer; a plain
            // read() may legally return fewer bytes than requested
            in.readFully(buffer, 0, chunkLength);
            lastReadByte = chunkStart;

            // Iterate over bytes in this buffer from its end to its beginning
            int consumed = chunkLength;
            int examined = chunkLength;
            while (examined > 0 && linesRead < n) {
                byte current = buffer[--examined];
                if (current == '\n' || current == '\r') {
                    // Found an end of line character
                    // Report the line that follows it unless it's empty
                    long offsetOfThisEol = chunkStart + examined;
                    if (offsetOfLastEol - offsetOfThisEol > 1) {
                        if (output != null) {
                            line.clear();
                            // +1 is to skip the EOL at the beginning
                            line.append(buffer, examined + 1, consumed - (examined + 1));
                            // Also append bytes remaining from last buffer
                            if (remainder.getLength() > 0) {
                                line.append(remainder.getBytes(), 0, remainder.getLength());
                            }
                            stockObject.fromText(line);
                            output.collect(stockObject);
                        }
                        linesRead++;
                        remainder.clear();
                    }
                    consumed = examined;
                    offsetOfLastEol = offsetOfThisEol;
                }
            }

            if (consumed > 0) {
                // There are still some bytes not consumed in buffer
                if (remainder.getLength() == 0) {
                    // Store whatever is remaining in the remainder
                    remainder.append(buffer, 0, consumed);
                } else {
                    // Prepend remaining bytes to the ones kept from later chunks
                    Text joined = new Text();
                    joined.append(buffer, 0, consumed);
                    joined.append(remainder.getBytes(), 0, remainder.getLength());
                    remainder = joined;
                }
            }
        }

        if (linesRead < n && remainder.getLength() > 0) {
            // Position 0 was reached: the first line of the stream still needs
            // to be reported
            if (output != null) {
                stockObject.fromText(remainder);
                output.collect(stockObject);
            }
            // The earliest line starts at position 0
            offsetOfLastEol = -1;
        }

        return offsetOfLastEol + 1;
    }

    /**
     * Reads a maximum of n non-empty lines from the end of the given file.
     * The position of the earliest line read is returned. The file is opened
     * for this call and closed before returning.
     * @param fs - The file system that contains the file
     * @param file - Path of the file to read
     * @param n - Maximum number of lines to return
     * @param stockObject - An object used to deserialize lines read. It can
     *   be <code>null</code> if output is also <code>null</code>
     * @param output - An output collector used to report lines read
     * @return - The position of the beginning of the earliest line read
     * @throws IOException if the file cannot be opened or read
     */
    public static <T extends TextSerializable> long tail(FileSystem fs, Path file, int n, T stockObject,
            ResultCollector<T> output) throws IOException {
        FSDataInputStream in = null;
        try {
            in = fs.open(file);
            // Start scanning from the end of the file
            long length = fs.getFileStatus(file).getLen();
            in.seek(length);
            return tail(in, n, stockObject, output);
        } finally {
            if (in != null) {
                in.close();
            }
        }
    }

    /**
     * Prints the last lines of a file to standard output. The input path, the
     * number of lines and the shape used to parse each line are taken from the
     * command line arguments. When no shape is obtained from the arguments,
     * lines are parsed as {@link Text2}.
     * @param args - Command line arguments
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException {
        CommandLineArguments cla = new CommandLineArguments(args);
        // Use this class (not an unrelated operation) to configure the job
        JobConf conf = new JobConf(Tail.class);
        Path inputFile = cla.getPath();
        FileSystem fs = inputFile.getFileSystem(conf);
        if (!fs.exists(inputFile)) {
            throw new RuntimeException("Input file does not exist");
        }
        int count = cla.getCount();
        TextSerializable stockObject = cla.getShape(true);
        if (stockObject == null) {
            stockObject = new Text2();
        }

        tail(fs, inputFile, count, stockObject, new ResultCollector<TextSerializable>() {

            @Override
            public void collect(TextSerializable value) {
                System.out.println(value);
            }
        });
    }
}