org.gbif.ocurrence.index.solr.SolrOutputFormat.java Source code

Introduction

Here is the source code for org.gbif.ocurrence.index.solr.SolrOutputFormat.java, a Hadoop FileOutputFormat that ships a Solr home directory to task nodes through the distributed cache and writes Solr indexes as job output, either as raw directory trees or as zip files.
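
Before the listing itself, a minimal driver sketch showing how this output format is typically wired into a job using the old mapreduce API. The class name IndexDriver, the paths, and the tuning values are illustrative assumptions, not part of the GBIF code; SolrRecordWriter (referenced by the class below) is assumed to be on the job classpath.

import java.io.File;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IndexDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "solr-indexing");

        // Zip the local solr.home (its conf/ and lib/ directories) and ship it
        // to the tasks via the distributed cache; also sets SETUP_OK and ZIP_NAME.
        SolrOutputFormat.setupSolrHomeCache(new File("/local/path/to/solr-home"),
                job.getConfiguration());

        // Optional tuning: zipped index output, 4 writer threads, batches of 100.
        SolrOutputFormat.setOutputZipFormat(true, job.getConfiguration());
        SolrOutputFormat.setSolrWriterThreadCount(4, job.getConfiguration());
        SolrOutputFormat.setBatchSize(100, job.getConfiguration());

        job.setOutputFormatClass(SolrOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("/user/me/shards"));
        // ... set mapper, reducer, input format and input paths as usual ...
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}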

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.ocurrence.index.solr;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SolrOutputFormat<K, V> extends FileOutputFormat<K, V> {

    private static final Log LOG = LogFactory.getLog(SolrOutputFormat.class);

    /**
     * The configuration key used to pass the Solr config zip file information.
     * Its value is the HDFS path to the configuration zip file.
     */
    public static final String SETUP_OK = "solr.output.format.setup";

    /** The key used to pass the zip file name through the configuration. */
    public static final String ZIP_NAME = "solr.zip.name";

    /**
     * The base name of the zip file containing the configuration information.
     * This file is passed via the distributed cache using a unique name, obtained
     * via {@link #getZipName(Configuration)}.
     */
    public static final String ZIP_FILE_BASE_NAME = "solr.zip";

    /**
     * The key used to pass the boolean configuration parameter that selects
     * between raw index directory output and zipped index output.
     */
    public static final String OUTPUT_ZIP_FILE = "solr.output.zip.format";

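    // Task-side writer tunables and their defaults: number of indexing threads,
    // bounded size of the pending-document queue, and documents per update batch.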
    static int defaultSolrWriterThreadCount = 2;

    public static final String SOLR_WRITER_THREAD_COUNT = "solr.record.writer.num.threads";

    static int defaultSolrWriterQueueSize = 100;

    public static final String SOLR_WRITER_QUEUE_SIZE = "solr.record.writer.max.queues.size";

    static int defaultSolrBatchSize = 20;

    public static final String SOLR_RECORD_WRITER_BATCH_SIZE = "solr.record.writer.batch.size";

    /** Return the configuration key under which the zip file's HDFS path is stored. */
    public static String getSetupOk() {
        return SETUP_OK;
    }

    /** Set the number of threads used for index writing. */
    public static void setSolrWriterThreadCount(int count, Configuration conf) {
        conf.setInt(SOLR_WRITER_THREAD_COUNT, count);
    }

    /** Return the number of threads used for index writing. */
    public static int getSolrWriterThreadCount(Configuration conf) {
        return conf.getInt(SOLR_WRITER_THREAD_COUNT, defaultSolrWriterThreadCount);
    }

    /**
     * Set the maximum size of the queue of documents awaiting index writing.
     */
    public static void setSolrWriterQueueSize(int count, Configuration conf) {
        conf.setInt(SOLR_WRITER_QUEUE_SIZE, count);
    }

    /** Return the maximum number of documents that may be queued awaiting index writing. */
    public static int getSolrWriterQueueSize(Configuration conf) {
        return conf.getInt(SOLR_WRITER_QUEUE_SIZE, defaultSolrWriterQueueSize);
    }

    /**
     * Return the file name portion of the configuration zip file, from the
     * configuration.
     */
    public static String getZipName(Configuration conf) {
        return conf.get(ZIP_NAME, ZIP_FILE_BASE_NAME);
    }

    /**
     * Configure the job to output zip files of the index rather than full
     * directory trees. Zip files are about 1/5th the size of the raw index and
     * much faster to write, but take more CPU to create. Zip files are ideal
     * for deploying into a Katta-managed shard.
     *
     * @param output true to emit zipped indexes, false for raw directory trees
     * @param conf the job configuration
     */
    public static void setOutputZipFormat(boolean output, Configuration conf) {
        conf.setBoolean(OUTPUT_ZIP_FILE, output);
    }

    /**
     * Return true if the output should be a zip file of the index rather than
     * the raw index directory.
     *
     * @param conf the job configuration
     * @return true if zip output is enabled, false otherwise
     */
    public static boolean isOutputZipFormat(Configuration conf) {
        return conf.getBoolean(OUTPUT_ZIP_FILE, false);
    }

    @Override
    public void checkOutputSpecs(JobContext job) throws IOException {
        super.checkOutputSpecs(job);
        /* if (job.getConfiguration().get(SETUP_OK) == null) {
           throw new IOException("Solr home cache not set up!");
         }*/
    }

    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
        return new SolrRecordWriter<K, V>(context);
    }

    public static void setupSolrHomeCache(File solrHome, Configuration jobConf) throws IOException {
        if (solrHome == null || !(solrHome.exists() && solrHome.isDirectory())) {
            throw new IOException("Invalid solr.home: " + solrHome);
        }
        File tmpZip = File.createTempFile("solr", ".zip");
        createZip(solrHome, tmpZip);
        // Make a reasonably unique name for the zip file in the distributed cache
        // to avoid collisions if multiple jobs are running.
        String hdfsZipName = UUID.randomUUID().toString() + '.' + ZIP_FILE_BASE_NAME;
        jobConf.set(ZIP_NAME, hdfsZipName);

        Path zipPath = new Path("/tmp", getZipName(jobConf));
        FileSystem fs = FileSystem.get(jobConf);
        fs.copyFromLocalFile(new Path(tmpZip.toString()), zipPath);
        tmpZip.delete(); // the local temp zip is no longer needed once copied to HDFS
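        // The '#<name>' fragment asks the distributed cache to expose the
        // unpacked archive under that name on each task node.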
        final URI baseZipUrl = fs.getUri().resolve(zipPath.toString() + '#' + getZipName(jobConf));

        DistributedCache.addCacheArchive(baseZipUrl, jobConf);
        LOG.info("Set Solr cache: " + Arrays.asList(DistributedCache.getCacheArchives(jobConf)));
        // Actually send the path for the configuration zip file
        jobConf.set(SETUP_OK, zipPath.toString());
    }

    private static void createZip(File dir, File out) throws IOException {
        HashSet<File> files = new HashSet<File>();
        // take only conf/ and lib/
        for (String allowedDirectory : SolrRecordWriter.getAllowedConfigDirectories()) {
            File configDir = new File(dir, allowedDirectory);
            boolean configDirExists = configDir.exists();
            // If the directory does not exist and is required, bail out.
            if (!configDirExists && SolrRecordWriter.isRequiredConfigDirectory(allowedDirectory)) {
                throw new IOException(String.format("required configuration directory %s is not present in %s",
                        allowedDirectory, dir));
            }
            if (!configDirExists) {
                continue;
            }
            // Store the files found in this existing, allowed directory in the
            // set of files to include in the zip file.
            listFiles(configDir, files);
        }

        out.delete();
        // Length of the prefix to strip so zip entries are relative to solr.home.
        int subst = dir.toString().length();
        ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(out));
        byte[] buf = new byte[1024];
        try {
            for (File f : files) {
                ZipEntry ze = new ZipEntry(f.toString().substring(subst));
                zos.putNextEntry(ze);
                InputStream is = new FileInputStream(f);
                try {
                    int cnt;
                    while ((cnt = is.read(buf)) >= 0) {
                        zos.write(buf, 0, cnt);
                    }
                } finally {
                    is.close();
                }
                zos.flush();
                zos.closeEntry();
            }
        } finally {
            zos.close();
        }
    }

    private static void listFiles(File dir, Set<File> files) throws IOException {
        File[] list = dir.listFiles();
        if (list == null) {
            throw new IOException("Unable to list directory " + dir);
        }
        for (File f : list) {
            if (f.isFile()) {
                files.add(f);
            } else {
                listFiles(f, files);
            }
        }
    }

    /** Return the number of documents sent to Solr in each update batch. */
    public static int getBatchSize(Configuration jobConf) {
        return jobConf.getInt(SOLR_RECORD_WRITER_BATCH_SIZE, defaultSolrBatchSize);
    }

    /** Set the number of documents sent to Solr in each update batch. */
    public static void setBatchSize(int count, Configuration jobConf) {
        jobConf.setInt(SOLR_RECORD_WRITER_BATCH_SIZE, count);
    }

}
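
The matching SolrRecordWriter is not shown on this page. As a rough sketch of the consumer side, a record writer would read the tunables defined above from the task configuration; the class below is only an illustration under that assumption, not GBIF code.

import org.apache.hadoop.conf.Configuration;

class WriterTuning {
    final int threads;
    final int queueSize;
    final int batchSize;
    final boolean zipOutput;

    WriterTuning(Configuration conf) {
        threads = SolrOutputFormat.getSolrWriterThreadCount(conf); // default 2
        queueSize = SolrOutputFormat.getSolrWriterQueueSize(conf); // default 100
        batchSize = SolrOutputFormat.getBatchSize(conf);           // default 20
        zipOutput = SolrOutputFormat.isOutputZipFormat(conf);      // default false
    }
}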