org.opensextant.matching.DataLoader.java Source code

Introduction

Here is the source code for org.opensextant.matching.DataLoader.java
Source

/*
 This software was produced for the U. S. Government
 under Contract No. W15P7T-11-C-F600, and is
 subject to the Rights in Noncommercial Computer Software
 and Noncommercial Computer Software Documentation
 Clause 252.227-7014 (JUN 1995)
    
 Copyright 2013 The MITRE Corporation. All Rights Reserved.
    
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    
 http://www.apache.org/licenses/LICENSE-2.0
    
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
package org.opensextant.matching;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.SolrResponseBase;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line utility that pushes a tab-separated file into one of the Solr
 * cores handed out by {@link MatcherFactory} (the gazetteer core or the
 * vocabulary core) through the CSV update handler.
 *
 * <p>
 * Input is either a ready-to-load flat file ("csv") or an "index" file that
 * lists sub-files to be flattened into a single temporary file first; see
 * {@link #flatten(String)}.
 */
public class DataLoader {

    /** Request parameters shared by every load; copied (never mutated) per request. */
    private static final ModifiableSolrParams loadParams = new ModifiableSolrParams();

    /** Solr request handler the content stream is posted to. */
    private static final String requestHandler = "/update";

    /** Log object. */
    private static final Logger LOGGER = LoggerFactory.getLogger(DataLoader.class);

    /** Some common params. */
    static {
        loadParams.set("update.contentType", "text/csv");
        loadParams.set("skipLines", "1");
        loadParams.set("optimize", "true");
        loadParams.set("separator", "\t");
        loadParams.set("header", "false");
        loadParams.set("trim", "on");
        loadParams.set("overwrite", "false");
        loadParams.set("debug", "true");
    }

    /** Utility class, not meant to be instantiated. */
    private DataLoader() {
    }

    /**
     * Entry point.
     *
     * @param args {@code <scheme> <inputformat> <inputfilepath> [<solrhome>]};
     *             any other argument count prints the usage text and returns
     * @throws Exception if obtaining or closing the Solr client fails
     */
    public static void main(String[] args) throws Exception {

        if (args.length < 3 || args.length > 4) {
            usage();
            // Nothing sensible can be done without the mandatory arguments;
            // continuing would fail on args[0..2].
            return;
        }

        String scheme = args[0];
        String inputForm = args[1];
        String csvFilePath = args[2];
        String solrhome = "";
        if (args.length == 4) {
            solrhome = args[3];
        }

        // get a SolrServer with the proper core
        SolrClient solrServer = getSolrServer(scheme, solrhome);

        try {
            // convert indexed content to flat list
            // currently creates a temp file, could stream?
            // Done inside the try so the client is closed even if flattening fails.
            if ("index".equalsIgnoreCase(inputForm)) {
                String flattened = flatten(csvFilePath);
                if (flattened == null) {
                    // flatten() has already logged the underlying cause
                    LOGGER.error("Could not flatten index file {}; nothing was loaded", csvFilePath);
                    return;
                }
                csvFilePath = flattened;
            }

            // set the fieldnames param for the selected schema
            final ModifiableSolrParams params = new ModifiableSolrParams(loadParams);
            if ("gazetteer".equalsIgnoreCase(scheme)) {
                params.set("fieldnames", MatcherFactory.getGazetteerFieldNamesLoader());
            } else {
                params.set("fieldnames", MatcherFactory.getVocabFieldNames());
            }

            // build the update request
            final ContentStreamUpdateRequest updateRequest = new ContentStreamUpdateRequest(requestHandler);
            updateRequest.setParams(params);

            // add the input file as a stream to the request
            ContentStream inStream = new ContentStreamBase.FileStream(new File(csvFilePath));
            updateRequest.addContentStream(inStream);

            // make the call
            try {
                SolrResponseBase response = updateRequest.process(solrServer);
                // see what happened
                printResponse(response);
            } catch (Exception e) {
                // pass the exception as the throwable so the stack trace is kept
                LOGGER.error("Exception in submitting Solr request", e);
            }

        } finally {
            // cleanup
            if (solrServer != null) {
                solrServer.close();
            }
        }
    }

    /** Logs the command-line syntax. */
    private static void usage() {
        String tmp = "DataLoader <scheme> <inputformat> <inputfilepath> <solrhome> where\n"
                + " <scheme> = gazetteer | vocabulary\n"
                + " <inputformat> = csv | index\n"
                + " <inputfilepath> = file to be loaded\n"
                + " <solrhome> = path to solr home (optional)\n";

        LOGGER.info(tmp);
    }

    /**
     * Flattens an index file into a single tab-separated temporary file.
     *
     * <p>
     * Each index line has the form {@code <file>:<cat>[:<tax>]}, where
     * {@code <file>} is resolved against the index file's directory. Every line
     * of every referenced file is written out as
     * {@code <id> TAB <line> TAB <cat> TAB <tax>}, with {@code <id>} a running
     * counter starting at 0 and {@code <tax>} defaulting to {@code NONE}.
     *
     * @param currentPath path of the index file
     * @return absolute path of the temporary flat file, or {@code null} if the
     *         index or a referenced file could not be read, or the temporary
     *         file could not be created or written (the cause is logged)
     */
    private static String flatten(String currentPath) {

        File input = new File(currentPath);
        File topDir = input.getParentFile();

        // read the index file into the index Map
        Map<File, String> index = readIndex(input, topDir);
        if (index == null) {
            return null;
        }

        File tmp;
        try {
            // suffix needs its own dot; createTempFile does not add one
            tmp = File.createTempFile("vocab", ".txt");
        } catch (IOException e) {
            LOGGER.error("Could not create temp file when flattening vocab:", e);
            return null;
        }
        // the flat file is only needed for the duration of this load
        tmp.deleteOnExit();

        // loop over the files mentioned in the index and write to temp file
        int indexID = 0;
        for (Map.Entry<File, String> entry : index.entrySet()) {
            String[] catAndTax = entry.getValue().split(":");
            String cat = catAndTax[0];
            String tax = catAndTax.length > 1 ? catAndTax[1] : "NONE";

            int written = appendFlattened(entry.getKey(), cat, tax, indexID, tmp);
            if (written < 0) {
                // a partially written file must not be loaded
                return null;
            }
            indexID += written;
        }
        LOGGER.info("Flattened " + indexID + " vocabulary entries to temp file");

        // return temp file path
        return tmp.getAbsolutePath();
    }

    /**
     * Reads the index file into a map from referenced sub-file to its
     * {@code <cat>[:<tax>]} value. Lines without at least a file and a
     * {@code cat} part are skipped (non-blank ones with a warning).
     *
     * @param input  the index file
     * @param topDir directory the sub-file names are resolved against; may be
     *               {@code null}, in which case names are used as given
     * @return the map, or {@code null} if the index file could not be opened
     */
    private static Map<File, String> readIndex(File input, File topDir) {
        Map<File, String> index = new HashMap<File, String>();

        LineIterator indexIter = null;
        try {
            indexIter = FileUtils.lineIterator(input, "UTF-8");
            while (indexIter.hasNext()) {
                String line = indexIter.next();
                String[] pieces = line.split(":");
                if (pieces.length < 2) {
                    if (!line.trim().isEmpty()) {
                        LOGGER.warn("Skipping malformed index line in {}: {}", input.getName(), line);
                    }
                    continue;
                }

                String tmpVal = pieces[1];
                if (pieces.length >= 3) {
                    tmpVal = tmpVal + ":" + pieces[2];
                }
                index.put(new File(topDir, pieces[0]), tmpVal);
            }
        } catch (IOException e) {
            LOGGER.error("Couldnt read from " + input.getName() + ":", e);
            return null;
        } finally {
            if (indexIter != null) {
                LineIterator.closeQuietly(indexIter);
            }
        }
        return index;
    }

    /**
     * Appends the flattened form of one sub-file to the target file. The lines
     * are collected in memory and written with a single append, instead of
     * reopening the target for every line.
     *
     * @param subFile file whose lines are flattened
     * @param cat     value of the third output column
     * @param tax     value of the fourth output column
     * @param firstId id given to the first line; following lines count up from it
     * @param target  file to append to
     * @return number of lines appended, or {@code -1} if the sub-file could not
     *         be opened or the target could not be written (the cause is logged)
     */
    private static int appendFlattened(File subFile, String cat, String tax, int firstId, File target) {
        StringBuilder flat = new StringBuilder();
        int count = 0;

        LineIterator contentIter = null;
        try {
            contentIter = FileUtils.lineIterator(subFile, "UTF-8");
            while (contentIter.hasNext()) {
                String line = contentIter.next();
                // concat the pieces
                flat.append(firstId + count).append('\t').append(line).append('\t').append(cat).append('\t')
                        .append(tax).append('\n');
                count++;
            }
        } catch (IOException e) {
            LOGGER.error("Couldnt read from " + subFile.getName(), e);
            return -1;
        } finally {
            if (contentIter != null) {
                LineIterator.closeQuietly(contentIter);
            }
        }

        if (count > 0) {
            try {
                FileUtils.writeStringToFile(target, flat.toString(), "UTF-8", true);
            } catch (IOException e) {
                LOGGER.error("Could not write to temp file when flattening vocab:", e);
                return -1;
            }
        }
        return count;
    }

    /** Logs the string form of the Solr response. */
    private static void printResponse(SolrResponseBase response) {
        LOGGER.info(response.toString());
    }

    /**
     * Configures and starts {@link MatcherFactory}, then returns the client for
     * the requested scheme.
     *
     * @param scheme   "gazetteer" (case-insensitive) selects the geo client;
     *                 anything else selects the vocabulary client
     * @param solrhome Solr home passed to {@link MatcherFactory#config}
     * @return the client supplied by {@link MatcherFactory}
     */
    private static SolrClient getSolrServer(String scheme, String solrhome) {

        MatcherFactory.config(solrhome);
        MatcherFactory.start();

        if ("gazetteer".equalsIgnoreCase(scheme)) {
            return MatcherFactory.getSolrServerGeo();
        }
        return MatcherFactory.getSolrServerVocab();
    }

}