edu.indiana.d2i.htrc.io.index.solr.SequentialVectorFromSolr.java Source code

Java tutorial

Introduction

Here is the source code for edu.indiana.d2i.htrc.io.index.solr.SequentialVectorFromSolr.java

Source

/*
#
# Copyright 2012 The Trustees of Indiana University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# -----------------------------------------------------------------
#
# Project: knn
# File:  SequentialVectorFromSolr.java
# Description:  
#
# -----------------------------------------------------------------
# 
*/

package edu.indiana.d2i.htrc.io.index.solr;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.InputStreamReader;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.VectorWritable;

import edu.indiana.d2i.htrc.HTRCConstants;

/**
 * Hadoop {@link Tool} that reads ids line by line from a text file, asks a
 * {@code SolrClient} for the term vectors of each id, and appends every returned
 * {@link NamedVector} to a {@link SequenceFile} keyed by the vector's name.
 *
 * <p>Expected arguments, in order: {@code solrURL dictionaryFile idsFile outputFile}.
 * The ids file and the output file are opened through the {@link FileSystem}
 * resolved from the tool's configuration.
 */
public class SequentialVectorFromSolr extends Configured implements Tool {

    private static final Log logger = LogFactory.getLog(SequentialVectorFromSolr.class);

    /** Number of command-line arguments the tool requires. */
    private static final int EXPECTED_ARGS = 4;

    /** A progress line is printed each time this many vectors have been written. */
    private static final int PROGRESS_INTERVAL = 1000;

    /**
     * Prints the argument error and the expected command line. It does not terminate
     * the JVM; {@link #run(String[])} signals the failure through its return code.
     */
    private void printUsage() {
        System.out.println("Bad input arguments!");
        System.out.println("Usage: SequentialVectorFromSolr <solrURL> <dictionaryFile> <idsFile> <outputFile>");
    }

    /**
     * Fetches term vectors for every id listed in the ids file and writes them to the
     * output sequence file.
     *
     * @param args {@code solrURL, dictionaryFile, idsFile, outputFile}
     * @return {@code 0} on success, {@code 1} if the argument count is wrong
     * @throws Exception if reading the ids, querying the client, or writing the output fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != EXPECTED_ARGS) {
            printUsage();
            // Return instead of System.exit so programmatic ToolRunner callers are not killed;
            // main() still exits with this code.
            return 1;
        }

        String solrURL = args[0];
        String dictionaryFile = args[1];
        String idsFile = args[2];
        String outputFile = args[3];

        logger.info("SequentialVectorFromSolr ");
        logger.info(" - solrURL: " + solrURL);
        logger.info(" - dictionaryFile: " + dictionaryFile);
        logger.info(" - idsFile: " + idsFile); // on HDFS
        logger.info(" - outputFile: " + outputFile); // on HDFS

        Configuration conf = getConf();
        // The Solr URL key is spelled out literally (a previously commented-out variant
        // used HTRCConstants.SOLR_MAIN_URL instead).
        conf.set("htrc.solr.url", solrURL);
        conf.set(HTRCConstants.DICTIONARY_PATH, dictionaryFile);

        // NOTE(review): the meaning of the boolean flag is defined by SolrClient; confirm there.
        SolrClient client = new SolrClient(conf, true);
        FileSystem fs = FileSystem.get(conf);

        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(outputFile), Text.class,
                VectorWritable.class);
        try {
            long t0 = System.nanoTime();
            // fs.open already returns a DataInputStream subclass, so no extra wrapper is needed.
            DataInputStream fsinput = fs.open(new Path(idsFile));
            // Decode explicitly as UTF-8 rather than relying on the platform default charset.
            BufferedReader reader = new BufferedReader(new InputStreamReader(fsinput, "UTF-8"));
            try {
                String line = null;
                String[] ids = new String[1]; // the client is queried with one id at a time
                VectorWritable value = new VectorWritable();
                Text key = new Text();
                int count = 0;
                while ((line = reader.readLine()) != null) {
                    if (line.trim().length() == 0) {
                        continue; // skip blank lines instead of querying Solr with an empty id
                    }
                    ids[0] = line;
                    Iterable<NamedVector> termVectors = client.getTermVectors(ids);
                    for (NamedVector namedVector : termVectors) {
                        value.set(namedVector);
                        key.set(namedVector.getName());
                        writer.append(key, value);
                        count++;
                        // Report inside the inner loop so each milestone is printed exactly once,
                        // even when an id yields zero or several vectors.
                        if (count % PROGRESS_INTERVAL == 0) {
                            System.out.println("Finish " + count + " volumes.");
                        }
                    }
                }
                long t1 = System.nanoTime();
                System.out.println("Takes " + (t1 - t0) / 1e9 + " seconds");
            } finally {
                reader.close(); // also closes the underlying file system stream
            }
        } finally {
            // Always close the writer so the output is released even on failure.
            writer.close();
        }

        return 0;
    }

    /**
     * Command-line entry point: runs the tool through {@link ToolRunner} and exits with
     * the tool's return code.
     *
     * @param args command-line arguments, passed to {@link #run(String[])} after
     *             ToolRunner has handled the generic Hadoop options
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new SequentialVectorFromSolr(), args);
        System.exit(res);
    }
}