com.datasalt.pangool.solr.TupleSolrOutputFormat.java Source code

Introduction

Here is the source code for com.datasalt.pangool.solr.TupleSolrOutputFormat.java
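As a quick orientation before the listing, here is a minimal usage sketch of the constructors defined below. It is a sketch only: the "solr-conf" path and the tuning values are placeholders, and the constructors need a reachable Hadoop FileSystem because they upload the zipped SOLR configuration.

import java.io.File;

import org.apache.hadoop.conf.Configuration;

import com.datasalt.pangool.solr.DefaultTupleDocumentConverter;
import com.datasalt.pangool.solr.TupleSolrOutputFormat;

public class TupleSolrOutputFormatUsage {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder: a local folder containing conf/schema.xml and the rest of the SOLR configuration
        File solrHome = new File("solr-conf");

        // Simplest form: DefaultTupleDocumentConverter, no ZIP output and the
        // defaults batchSize = 20, threadCount = 2, queueSize = 100.
        TupleSolrOutputFormat basic = new TupleSolrOutputFormat(solrHome, conf);

        // Fully tuned form: ZIP the index, batches of 100 documents,
        // 4 indexing threads, at most 40 queued batches.
        TupleSolrOutputFormat tuned = new TupleSolrOutputFormat(solrHome, conf,
                new DefaultTupleDocumentConverter(), true, 100, 4, 40);
    }
}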

Source

/**
 * Copyright [2012] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.pangool.solr;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.net.URI;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.datasalt.pangool.io.ITuple;

/**
 * Instantiable OutputFormat that can be used in Pangool for indexing {@link ITuple} in SOLR. It behaves similarly to
 * SOLR-1301's SolrOutputFormat, with the difference that configuration is passed per instance (constructor
 * parameters). This makes it easy to use multiple {@link TupleSolrOutputFormat} instances in the same Pangool Job. It
 * is also much easier to configure: it only needs to be instantiated (there is no need to call several static methods
 * to configure it); everything is configured under the hood.
 * <p>
 * Things that can be configured via constructor:
 * <ul>
 * <li>solrHome: Local folder where schema.xml and such can be found.</li>
 * <li>converter: An implementation of {@link TupleDocumentConverter} other than {@link DefaultTupleDocumentConverter},
 * for users who want to do complex things like boosting.</li>
 * <li>outputZipFile: Whether to produce a ZIP file with the index.</li>
 * <li>batchSize: Number of documents that will go in each indexing batch.</li>
 * <li>threadCount: Number of threads in a pool that will be used for indexing.</li>
 * <li>queueSize: Maximum number of batches that can be pooled in the batch indexing thread pool.</li>
 * </ul>
 * For a usage example see test class {@link TupleSolrOutputFormatExample}.
 */
@SuppressWarnings("serial")
public class TupleSolrOutputFormat extends FileOutputFormat<ITuple, NullWritable> implements Serializable {

    private static final Log LOG = LogFactory.getLog(TupleSolrOutputFormat.class);

    /**
     * The base name of the zip file containing the configuration information. This file is passed via the distributed
     * cache using a unique, per-job name (a random UUID prefix; see {@link #setupSolrHomeCache(File, Configuration)}).
     */
    public static final String ZIP_FILE_BASE_NAME = "solr.zip";

    static int defaultSolrWriterThreadCount = 2;
    static int defaultSolrWriterQueueSize = 100;
    static int defaultSolrBatchSize = 20;

    /**
     * Size of the document batches used for indexing
     */
    private int batchSize = defaultSolrBatchSize;
    /**
     * The number of threads that will be used for indexing
     */
    private int threadCount = defaultSolrWriterThreadCount;
    /**
     * The SOLR writer queue size
     */
    private int queueSize = defaultSolrWriterQueueSize;

    /**
     * Whether the output should be a ZIP of the index or not
     */
    private boolean outputZipFile = false;

    /**
     * The zip file name used for sending the SOLR configuration in the DistributedCache
     */
    private String zipName;
    /**
     * The local SOLR home to be used if this is run in Hadoop's local mode (no DistributedCache)
     */
    private String localSolrHome;
    /**
     * The document converter. By default, {@link DefaultTupleDocumentConverter}, but users may implement custom ones for
     * boosting, etc.
     */
    private TupleDocumentConverter converter;

    @Override
    public void checkOutputSpecs(JobContext job) throws IOException {
        super.checkOutputSpecs(job);
    }

    @Override
    public RecordWriter<ITuple, NullWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new SolrRecordWriter(batchSize, outputZipFile, threadCount, queueSize, localSolrHome, zipName,
                converter, context);
    }

    public TupleSolrOutputFormat(File solrHome, Configuration hadoopConf) throws IOException {
        setupSolrHomeCache(solrHome, hadoopConf);
        this.converter = new DefaultTupleDocumentConverter();
    }

    public TupleSolrOutputFormat(File solrHome, Configuration hadoopConf, TupleDocumentConverter converter)
            throws IOException {
        this(solrHome, hadoopConf);
        this.converter = converter;
    }

    public TupleSolrOutputFormat(File solrHome, Configuration hadoopConf, TupleDocumentConverter converter,
            boolean outputZipFile, int batchSize, int threadCount, int queueSize) throws IOException {
        this(solrHome, hadoopConf, converter);
        this.outputZipFile = outputZipFile;
        this.batchSize = batchSize;
        this.threadCount = threadCount;
        this.queueSize = queueSize;
    }

    private void setupSolrHomeCache(File solrHome, Configuration conf) throws IOException {
        if (solrHome == null || !(solrHome.exists() && solrHome.isDirectory())) {
            throw new IOException("Invalid solr.home: " + solrHome);
        }
        localSolrHome = solrHome.getAbsolutePath();
        File tmpZip = File.createTempFile("solr", ".zip");
        // Clean up the local temporary copy when the JVM exits
        tmpZip.deleteOnExit();
        createZip(solrHome, tmpZip);
        // Make a reasonably unique name for the zip file in the distributed cache
        // to avoid collisions if multiple jobs are running.
        zipName = UUID.randomUUID().toString() + '.' + ZIP_FILE_BASE_NAME;

        Path zipPath = new Path("/tmp", zipName);
        FileSystem fs = FileSystem.get(conf);
        fs.copyFromLocalFile(new Path(tmpZip.toString()), zipPath);
        final URI baseZipUrl = fs.getUri().resolve(zipPath.toString() + '#' + zipName);

        DistributedCache.addCacheArchive(baseZipUrl, conf);
        LOG.info("Set Solr cache: " + Arrays.asList(DistributedCache.getCacheArchives(conf)));
    }

    private static void createZip(File dir, File out) throws IOException {
        HashSet<File> files = new HashSet<File>();
        // take only conf/ and lib/
        for (String allowedDirectory : SolrRecordWriter.getAllowedConfigDirectories()) {
            File configDir = new File(dir, allowedDirectory);
            // If the directory does not exist and is required, bail out
            boolean configDirExists = configDir.exists();
            if (!configDirExists && SolrRecordWriter.isRequiredConfigDirectory(allowedDirectory)) {
                throw new IOException(String.format("required configuration directory %s is not present in %s",
                        allowedDirectory, dir));
            }
            if (!configDirExists) {
                continue;
            }
            // Add the files of this existing, allowed directory to the set of
            // files that will go into the zip file
            listFiles(configDir, files);
        }

        out.delete();
        int subst = dir.toString().length();
        ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(out));
        try {
            byte[] buf = new byte[1024];
            for (File f : files) {
                // Entry names are the file paths relative to the solr home directory
                ZipEntry ze = new ZipEntry(f.toString().substring(subst));
                zos.putNextEntry(ze);
                InputStream is = new FileInputStream(f);
                try {
                    int cnt;
                    while ((cnt = is.read(buf)) >= 0) {
                        zos.write(buf, 0, cnt);
                    }
                } finally {
                    is.close();
                }
                zos.flush();
                zos.closeEntry();
            }
        } finally {
            zos.close();
        }
    }

    private static void listFiles(File dir, Set<File> files) throws IOException {
        File[] list = dir.listFiles();
        // listFiles() returns null on I/O error or if dir is not a directory
        if (list == null) {
            throw new IOException("Unable to list contents of directory " + dir);
        }
        for (File f : list) {
            if (f.isFile()) {
                files.add(f);
            } else {
                listFiles(f, files);
            }
        }
    }
}
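
The class javadoc above mentions plugging in a custom TupleDocumentConverter for things like boosting. As a rough sketch, assuming (based on how DefaultTupleDocumentConverter is used here; check the Pangool sources for the exact signature) that the interface declares a single convert(ITuple, NullWritable) method returning a SolrInputDocument, and assuming the usual Pangool accessors (tuple.getSchema().getFields(), Field.getName(), tuple.get(String)), a boosting converter could look like this:

package com.datasalt.pangool.solr;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.solr.common.SolrInputDocument;

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema.Field;

public class BoostingTupleDocumentConverter implements TupleDocumentConverter {

    public SolrInputDocument convert(ITuple tuple, NullWritable value) throws IOException {
        SolrInputDocument document = new SolrInputDocument();
        for (Field field : tuple.getSchema().getFields()) {
            String name = field.getName();
            Object fieldValue = tuple.get(name);
            if (fieldValue == null) {
                continue;
            }
            // Boost the hypothetical "title" field; index everything else normally.
            float boost = "title".equals(name) ? 2.0f : 1.0f;
            document.addField(name, fieldValue.toString(), boost);
        }
        return document;
    }
}

Such a converter would then be passed through the three-argument constructor: new TupleSolrOutputFormat(solrHome, conf, new BoostingTupleDocumentConverter()).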