edu.umd.cloud9.collection.clue.ClueWarcForwardIndex.java Source code

Java tutorial

Introduction

Here is the source code for edu.umd.cloud9.collection.clue.ClueWarcForwardIndex.java

Source

/*
 * Cloud9: A Hadoop toolkit for working with big data
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.cloud9.collection.clue;

import java.io.IOException;
import java.text.DecimalFormat;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.log4j.Logger;

import edu.umd.cloud9.collection.DocumentForwardIndex;

/**
 * Forward index over a collection of {@link ClueWarcRecord}s stored as SequenceFiles named
 * {@code part-NNNNN} under a collection directory.
 *
 * <p>The index file read by {@link #loadIndex(Path, Path, FileSystem)} consists of a class name
 * (ignored), the collection path, a block count, and then one {@code (docno, offset, fileno)}
 * triple per block. To fetch a document, the block whose starting docno is the largest one not
 * exceeding the requested docno is located by binary search, and the corresponding part file is
 * scanned forward from the block's offset.
 *
 * <p>NOTE(review): the binary search assumes the block docnos in the index file are in ascending
 * order; this class does not verify it.
 */
public class ClueWarcForwardIndex implements DocumentForwardIndex<ClueWarcRecord> {
    private static final Logger LOG = Logger.getLogger(ClueWarcForwardIndex.class);

    /** Pattern producing the zero-padded, five-digit suffix of part file names. */
    private static final String PART_NUMBER_PATTERN = "00000";

    private Configuration conf;

    /** First docno of each block. */
    private int[] docnos;
    /** Byte offset of each block within its part file. */
    private int[] offsets;
    /** Part file number of each block. */
    private short[] fileno;
    private String collectionPath;
    /** Cached result of {@link #getLastDocno()}; -1 means "not yet computed". */
    private int lastDocno = -1;

    private ClueWarcDocnoMapping docnoMapping = new ClueWarcDocnoMapping();

    public ClueWarcForwardIndex() {
    }

    /**
     * Loads the docno mapping and the block table of the forward index.
     *
     * @param index path of the forward index file
     * @param mapping path of the docno mapping file
     * @param fs file system used to open both files
     * @throws IOException if either file cannot be read or the index header is corrupt
     */
    @Override
    public void loadIndex(Path index, Path mapping, FileSystem fs) throws IOException {
        this.conf = fs.getConf();

        LOG.info("Loading forward index: " + index);
        docnoMapping.loadMapping(mapping, fs);

        FSDataInputStream in = fs.open(index);
        try {
            // Class name; throw away.
            in.readUTF();
            String path = in.readUTF();

            int blocks = in.readInt();
            if (blocks < 0) {
                throw new IOException("Corrupt forward index " + index + ": negative block count " + blocks);
            }

            LOG.info(blocks + " blocks expected");
            int[] blockDocnos = new int[blocks];
            int[] blockOffsets = new int[blocks];
            short[] blockFileno = new short[blocks];

            for (int i = 0; i < blocks; i++) {
                blockDocnos[i] = in.readInt();
                blockOffsets[i] = in.readInt();
                blockFileno[i] = in.readShort();

                if (i > 0 && i % 100000 == 0) {
                    LOG.info(i + " blocks read");
                }
            }

            // Publish the new state only after the whole file was read successfully.
            collectionPath = path;
            docnos = blockDocnos;
            offsets = blockOffsets;
            fileno = blockFileno;
            // Any previously cached last docno belongs to the old index.
            lastDocno = -1;
        } finally {
            // Close the stream even when a read fails part-way through.
            in.close();
        }
    }

    @Override
    public String getCollectionPath() {
        return collectionPath;
    }

    /**
     * Fetches the document with the given docno.
     *
     * @param docno document number to fetch
     * @return the record, or {@code null} if the docno is outside the range of the collection,
     *         is not present in the expected part file, or an I/O error occurs
     */
    @Override
    public ClueWarcRecord getDocument(int docno) {
        long start = System.currentTimeMillis();

        // Trap invalid docnos.
        if (docno < getFirstDocno() || docno > getLastDocno()) {
            return null;
        }

        int idx = Arrays.binarySearch(docnos, docno);

        if (idx < 0) {
            // binarySearch returns -(insertionPoint) - 1 on a miss; step back to the block
            // that starts just before the insertion point, i.e. the one containing docno.
            idx = -idx - 2;
        }

        String file = partFile(idx);

        LOG.info("fetching docno " + docno + ": seeking to " + offsets[idx] + " at " + file);

        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(file)));
            try {
                IntWritable key = new IntWritable();

                reader.seek(offsets[idx]);

                boolean found = false;
                while (reader.next(key)) {
                    if (key.get() == docno) {
                        found = true;
                        break;
                    }
                }

                if (!found) {
                    // Reached the end of the file without a match: there is no current
                    // record to read, so report the document as missing.
                    LOG.warn("docno " + docno + " not found in " + file);
                    return null;
                }

                ClueWarcRecord value = new ClueWarcRecord();
                reader.getCurrentValue(value);

                long duration = System.currentTimeMillis() - start;

                LOG.info(" docno " + docno + " fetched in " + duration + "ms");
                return value;
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            LOG.error("Error fetching docno " + docno + " from " + file, e);
        }

        return null;
    }

    @Override
    public ClueWarcRecord getDocument(String docid) {
        return getDocument(docnoMapping.getDocno(docid));
    }

    @Override
    public int getDocno(String docid) {
        return docnoMapping.getDocno(docid);
    }

    @Override
    public String getDocid(int docno) {
        return docnoMapping.getDocid(docno);
    }

    @Override
    public int getFirstDocno() {
        return docnos[0];
    }

    /**
     * Returns the last docno in the collection. The value is computed on first use by scanning
     * the final block to the end of its part file, and is cached afterwards.
     *
     * @return the last docno, or -1 if it could not be determined
     */
    @Override
    public int getLastDocno() {
        if (lastDocno != -1) {
            return lastDocno;
        }

        // Find the last entry, and then scan all the way to the end of the collection.
        int idx = docnos.length - 1;

        String file = partFile(idx);

        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(file)));
            try {
                IntWritable key = new IntWritable();

                reader.seek(offsets[idx]);

                // Remember the key of the last record actually read rather than relying on
                // the key object's contents after next() has returned false.
                int last = -1;
                while (reader.next(key)) {
                    last = key.get();
                }
                lastDocno = last;
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            LOG.error("Error determining last docno from " + file, e);
        }

        return lastDocno;
    }

    /**
     * Builds the path of the part file holding the given block.
     *
     * @param idx index of the block in the block table
     * @return {@code collectionPath + "/part-" + fileno}, with the file number zero-padded to
     *         five digits
     */
    private String partFile(int idx) {
        // DecimalFormat is not thread-safe, so a fresh instance is used instead of a shared one.
        return collectionPath + "/part-" + new DecimalFormat(PART_NUMBER_PATTERN).format(fileno[idx]);
    }

    /**
     * Simple program that provides access to the document contents.
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 4) {
            System.out.println("usage: [findex] [mapping-file] [getDocno|getDocid] [docid/docno]");
            System.exit(-1);
        }

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        System.out.println("forward index: " + args[0]);
        System.out.println("mapping file: " + args[1]);

        ClueWarcForwardIndex findex = new ClueWarcForwardIndex();
        findex.loadIndex(new Path(args[0]), new Path(args[1]), fs);

        if (args[2].equals("getDocno")) {
            System.out.println("looking up docno " + args[3]);
            printDocument(findex.getDocument(Integer.parseInt(args[3])));
        } else if (args[2].equals("getDocid")) {
            System.out.println("looking up docid " + args[3]);
            printDocument(findex.getDocument(args[3]));
        } else {
            System.out.println("Invalid command!");
            System.out.println("usage: [findex] [mapping-file] [getDocno|getDocid] [docid/docno]");
        }
    }

    /** Prints the display content of a record, or a notice when the lookup returned nothing. */
    private static void printDocument(ClueWarcRecord doc) {
        if (doc == null) {
            // getDocument returns null for unknown docnos and on I/O errors.
            System.out.println("Document not found!");
            return;
        }
        System.out.println(doc.getDisplayContent());
    }
}