eu.scape_project.arcunpacker.mapreduce.ArcRecordReader.java Source code


Introduction

Here is the source code for eu.scape_project.arcunpacker.mapreduce.ArcRecordReader.java, a Hadoop RecordReader that unpacks ARC web-archive files: it reads the archive record by record and presents each one to the framework as a Text key (the record ID) paired with a HadoopArcRecord value.
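
A RecordReader is only ever instantiated by Hadoop through an InputFormat. That wrapper is not part of this listing, but a minimal sketch of one could look like the following (the class name ArcInputFormat is an assumption for illustration, not taken from the original source):

// Hypothetical companion class: a minimal InputFormat that wires
// ArcRecordReader into a MapReduce job.
package eu.scape_project.arcunpacker.mapreduce;

import eu.scape_project.arcunpacker.HadoopArcRecord;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class ArcInputFormat extends FileInputFormat<Text, HadoopArcRecord> {

    @Override
    public RecordReader<Text, HadoopArcRecord> createRecordReader(
            InputSplit split, TaskAttemptContext context) {
        return new ArcRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // The reader below consumes each archive sequentially from the start
        // of the file, so splits at arbitrary byte offsets would be wrong.
        return false;
    }
}

Returning false from isSplitable keeps each ARC file with a single reader, which matters because ArcRecordReader opens the whole file and hands the stream to its delegate from offset zero.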

Source

/*
 *  Copyright 2012 The SCAPE Project Consortium.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package eu.scape_project.arcunpacker.mapreduce;

import eu.scape_project.arcunpacker.HadoopArcRecord;
import eu.scape_project.arcunpacker.HeritrixWrapper;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Hadoop RecordReader that iterates over the records of an ARC web-archive
 * file, delegating the parsing to a Heritrix-based wrapper. Each record is
 * exposed as a Text key (the record ID) and a HadoopArcRecord value.
 *
 * @author shsdev https://github.com/shsdev
 * @version 0.2
 */
public final class ArcRecordReader extends RecordReader<Text, HadoopArcRecord> {

    private HeritrixWrapper archiveReaderDelegate; // parses the underlying ARC stream
    private Text key;                              // ID of the current record
    private HadoopArcRecord value;                 // payload of the current record

    @Override
    public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
        // Open the file behind this split and hand the stream to the Heritrix wrapper.
        FileSplit fileSplit = (FileSplit) is;
        try {
            Path path = fileSplit.getPath();

            FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());

            FSDataInputStream fileInputStream = fileSystem.open(path);
            FileStatus fileStatus = fileSystem.getFileStatus(path);
            long fileLength = fileStatus.getLen();

            archiveReaderDelegate = new HeritrixWrapper(path.getName(), fileInputStream, fileLength);
            key = new Text();
            value = new HadoopArcRecord();

        } catch (IOException ex) {
            Logger.getLogger(ArcRecordReader.class.getName())
                    .log(Level.SEVERE, "Failed to initialize ARC record reader", ex);
            throw ex;
        }

    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        try {
            boolean result = archiveReaderDelegate.nextKeyValue();
            if (result) {
                key.set(archiveReaderDelegate.getCurrentID());
                archiveReaderDelegate.getCurrentArcRecord(value);
            }
            return result;
        } catch (Exception e) {
            // Log and treat the failure as end of input rather than killing the task.
            Logger.getLogger(ArcRecordReader.class.getName())
                    .log(Level.SEVERE, "Failed to read next ARC record", e);
            return false;
        }
    }

    @Override
    public Text getCurrentKey() {
        return key;
    }

    @Override
    public HadoopArcRecord getCurrentValue() {
        return value;
    }

    @Override
    public void close() throws IOException {
        // Nothing is closed here; the HeritrixWrapper delegate owns the open input stream.
    }

    @Override
    public float getProgress() throws IOException {
        // Fraction of the underlying file consumed so far; guard against a zero-length file.
        long fileLength = archiveReaderDelegate.getFileLength();
        return fileLength == 0 ? 1.0f : (float) archiveReaderDelegate.getPosition() / fileLength;
    }

}
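
Finally, a hypothetical driver sketch showing how the reader could be wired into a job. ArcUnpackerDriver is an illustrative name, not part of the original project, and the sketch assumes the ArcInputFormat wrapper shown above is on the classpath:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ArcUnpackerDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "arc-unpacker");
        job.setJarByClass(ArcUnpackerDriver.class);

        // Keys/values delivered to the mapper come from ArcRecordReader:
        // Text record IDs and HadoopArcRecord payloads.
        job.setInputFormatClass(ArcInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // A real job would configure a Mapper<Text, HadoopArcRecord, ?, ?>
        // here to process each (record ID, record payload) pair.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}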