edu.indiana.d2i.htrc.io.dataapi.IDInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for edu.indiana.d2i.htrc.io.dataapi.IDInputFormat.java.

Source

/*
#
# Copyright 2012 The Trustees of Indiana University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# -----------------------------------------------------------------
#
# Project: knn
# File:  IDInputFormat.java
# Description:  
#
# -----------------------------------------------------------------
# 
*/

package edu.indiana.d2i.htrc.io.dataapi;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.LineReader;

import edu.indiana.d2i.htrc.HTRCConstants;

/**
 * An {@link FileInputFormat} whose input files are lists of HTRC volume IDs.
 * <p>
 * Rather than splitting by byte ranges, {@link #getSplits(JobContext)} reads every
 * ID from each configured input path and packs them into {@link IDInputSplit}s of at
 * most {@code HTRCConstants.MAX_IDNUM_SPLIT} IDs each. Records are produced by
 * {@link IDRecorderReader}, which fetches volume content from the HTRC Data API
 * hosts listed in {@code HTRCConstants.HOSTS_SEPARATEDBY_COMMA}.
 */
public class IDInputFormat<K extends Writable, V extends Writable> extends FileInputFormat<K, V> {
    private static final Log logger = LogFactory.getLog(IDInputFormat.class);

    /**
     * Returns a reader that turns a split of volume IDs into key/value records.
     * The unchecked cast is unavoidable: IDRecorderReader fixes the concrete
     * key/value types while this format is generic in K and V.
     */
    @SuppressWarnings("unchecked")
    @Override
    public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return (RecordReader<K, V>) new IDRecorderReader();
    }

    /**
     * Builds splits by reading volume IDs line-by-line (via {@link IDList}) from
     * every input path and grouping them into chunks of at most
     * {@code MAX_IDNUM_SPLIT} IDs (default 1,000,000).
     *
     * @param job the job context carrying the configuration and input paths
     * @return the list of {@link IDInputSplit}s covering all IDs found
     * @throws IOException if an input path cannot be read, or if the thread is
     *         interrupted while building splits (the interrupt status is restored)
     */
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        // Maximum number of IDs packed into a single split.
        int numIdsInSplit = job.getConfiguration().getInt(HTRCConstants.MAX_IDNUM_SPLIT, (int) 1e6);
        String hostStr = job.getConfiguration().get(HTRCConstants.HOSTS_SEPARATEDBY_COMMA,
                HTRCConstants.DATA_API_DEFAULT_URL);
        // Guards against a null DATA_API_DEFAULT_URL; with a non-null default the
        // lookup above cannot return null.
        if (hostStr == null)
            throw new RuntimeException("Cannot find hosts of HTRC Data Storage.");
        String[] hosts = hostStr.split(",");

        IDInputSplit split = new IDInputSplit(hosts);
        List<InputSplit> splits = new ArrayList<InputSplit>();
        Path[] dirs = getInputPaths(job);
        try {
            for (int i = 0; i < dirs.length; i++) {
                FileSystem fs = dirs[i].getFileSystem(job.getConfiguration());
                DataInputStream fsinput = new DataInputStream(fs.open(dirs[i]));
                // try/finally so the stream is closed even on failure; the
                // previous version leaked one open stream per input path.
                try {
                    Iterator<Text> idlist = new IDList(fsinput).iterator();
                    while (idlist.hasNext()) {
                        Text id = idlist.next();
                        split.addID(id.toString());
                        // Current split is full: seal it and start a fresh one.
                        if (split.getLength() >= numIdsInSplit) {
                            splits.add(split);
                            split = new IDInputSplit(hosts);
                        }
                    }
                } finally {
                    fsinput.close();
                }
            }
            // Flush the final, partially-filled split (if it holds any IDs).
            if (split.getLength() != 0)
                splits.add(split);
        } catch (InterruptedException e) {
            // Restore the interrupt status and fail fast instead of silently
            // returning a partial split list (which would drop input volumes).
            Thread.currentThread().interrupt();
            logger.error("Interrupted while building input splits", e);
            throw new IOException("Interrupted while building input splits", e);
        }

        logger.info("#Splits " + splits.size());
        return splits;
    }
}