finderbots.recommenders.hadoop.WriteToSolrJob.java Source code

Introduction

Here is the source code for finderbots.recommenders.hadoop.WriteToSolrJob.java
Source

package finderbots.recommenders.hadoop;
/**
 * Licensed to Patrick J. Ferrel (PJF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. PJF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * If someone wants license or copyrights to this let me know
 * pat.ferrel@gmail.com
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/**
 * <p>Writes the DRMs passed in as csv files for Solr to a location in HDFS or the local file system. The primary DRM is expected to be an item-item similarity matrix with Mahout internal IDs. The secondary DRM is from cross-action similarities. It also needs files containing maps of internal Mahout IDs to external IDs--one for user IDs and one for item IDs. It needs the location to put the similarity matrices; each will be put into a Solr field of type 'string' for indexing.</p>
 * <p>The Solr csv files will be of the form:</p>
 * <p>item_id,similar_items,cross_action_similar_items</p>
 * <p> ipad,iphone,iphone nexus</p>
 * <p> iphone,ipad,ipad galaxy</p>
 * <p>todo: There are two shared in-memory BiHashMaps per node. To remove the in-memory maps a more complex data flow needs to be implemented.</p>
 * <p>todo: Solr and LucidWorks Search support many stores for indexing. It might be nice to have a pluggable writer for different stores.</p>
 */

import com.google.common.collect.BiMap;
import org.apache.commons.lang.builder.ReflectionToStringBuilder;
import org.apache.commons.lang.builder.ToStringStyle;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.apache.mahout.math.Vector;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;

public class WriteToSolrJob extends Configured implements Tool {
    // Root log4j logger. Not referenced by any other code visible in this class.
    private static Logger LOGGER = Logger.getRootLogger();

    // Parsed command line options. Static, and replaced with a fresh instance on every call to run().
    private static Options options;
    // File system obtained from the job configuration in run(); cleanOutputDirs() uses it to delete old output.
    FileSystem fs;

    /**
     * Parses the command line, removes any previous output, and hands the input matrices to
     * {@code WriteDRMsToSolr} to produce the item-links and user-history documents.
     *
     * <p>As described by the original author: A and B are joined by user ID and written as a set of
     * CSV files with the headers {@code id,b_history,a_history}; [B'B] and [B'A] are joined by item
     * ID and written as a set of CSV files with the headers {@code id,b_b_links,b_a_links}.</p>
     *
     * <p>The joined (two matrix) form is used only when BOTH the cross-similarity matrix directory
     * and the secondary user-history directory were supplied with non-empty values. In every other
     * case only the primary (B) matrices are written.</p>
     *
     * @param args command line arguments, bound to {@link Options} by args4j
     * @return 0 on success, -1 when the arguments cannot be parsed (usage is printed to stderr)
     * @throws Exception whatever the file system calls or {@code WriteDRMsToSolr} throw
     */
    @Override
    public int run(String[] args) throws Exception {
        options = new Options();
        CmdLineParser parser = new CmdLineParser(options);

        try {
            parser.parseArgument(args);
        } catch (CmdLineException e) {
            System.err.println(e.getMessage());
            parser.printUsage(System.err);
            return -1;
        }
        fs = FileSystem.get(getConf());

        // Destructive: deletes the existing item-links and user-history output directories.
        cleanOutputDirs();

        Path itemIndexPath = new Path(options.getItemIndexFilePath());
        Path userIndexPath = new Path(options.getUserIndexFilePath());

        //inputs
        Path bTransposeByMatrixPath = new Path(options.getBTransposeBMatrixDir());
        Path bUserHistoryMatrixPath = new Path(options.getBUserHistoryMatrixDir());

        //outputs
        Path solrItemsLinksDocsFilesPath = new Path(options.getSolrItemLinksDocsDir());
        Path solrUserHistoryDocsFilesPath = new Path(options.getSolrUserHistoryDir());

        // The cross-similarity option defaults to "" rather than null, so a plain null check would
        // select the join branch when only the secondary history dir was given and then fail while
        // building a Path from an empty string. Require real values for both.
        String crossSimilarityDir = options.getBTransposeAMatrixDir();
        String secondaryHistoryDir = options.getAUserHistoryMatrixDir();

        if (isSupplied(crossSimilarityDir) && isSupplied(secondaryHistoryDir)) {
            //optional inputs
            Path bTransposeAMatrixPath = new Path(crossSimilarityDir);
            Path aUserHistoryMatrixPath = new Path(secondaryHistoryDir);

            Map<String, String> itemFields = solrFields(options.getItemIdFieldName(),
                    options.getBTranposeBFieldName());
            itemFields.put("dRM2FieldName", options.getBTransposeAFieldName());
            WriteDRMsToSolr join = new WriteDRMsToSolr(itemFields);
            join.joinDRMsWriteToSolr(itemIndexPath, itemIndexPath, bTransposeByMatrixPath, bTransposeAMatrixPath,
                    solrItemsLinksDocsFilesPath);

            Map<String, String> userFields = solrFields(options.getUserIdFieldName(),
                    options.getBUserHistoryFieldName());
            userFields.put("dRM2FieldName", options.getAUserHistoryFieldName());
            join = new WriteDRMsToSolr(userFields);
            join.joinDRMsWriteToSolr(userIndexPath, itemIndexPath, bUserHistoryMatrixPath, aUserHistoryMatrixPath,
                    solrUserHistoryDocsFilesPath);
        } else { //only using B actions so no CoGroup join required
            WriteDRMsToSolr join = new WriteDRMsToSolr(
                    solrFields(options.getItemIdFieldName(), options.getBTranposeBFieldName()));
            join.writeDRMToSolr(itemIndexPath, itemIndexPath, bTransposeByMatrixPath, solrItemsLinksDocsFilesPath);

            join = new WriteDRMsToSolr(
                    solrFields(options.getUserIdFieldName(), options.getBUserHistoryFieldName()));
            join.writeDRMToSolr(userIndexPath, itemIndexPath, bUserHistoryMatrixPath, solrUserHistoryDocsFilesPath);
        }
        return 0;
    }

    /** Returns true when an optional directory option carries a usable (non-null, non-empty) value. */
    private static boolean isSupplied(String dir) {
        return dir != null && !dir.isEmpty();
    }

    /** Builds the field-name map handed to {@code WriteDRMsToSolr} for the ID and first DRM columns. */
    private static Map<String, String> solrFields(String idFieldName, String drm1FieldName) {
        Map<String, String> fields = new HashMap<String, String>();
        fields.put("iD1", idFieldName);
        fields.put("dRM1FieldName", drm1FieldName);
        return fields;
    }

    /**
     * Builds a space-delimited list of IDs for the non-zero elements of a vector, ordered from the
     * highest element value to the lowest.
     *
     * <p>Each element's index is converted to a string and looked up in the inverse view of
     * {@code elementIndex}. Every ID is followed by a single space, so a non-empty result ends
     * with a trailing space. An index with no mapping contributes the text {@code null}.</p>
     *
     * @param v vector whose non-zero elements are listed
     * @param elementIndex bidirectional ID map; its inverse is queried with the element index
     * @return the ordered, space-delimited IDs, or an empty string when there are no non-zero elements
     */
    private String getOrderedItems(Vector v, BiMap<String, String> elementIndex) {
        // Highest weight first. Double.compare yields a consistent ordering (equal weights compare
        // as 0 in both directions), which Collections.sort requires of its comparator.
        Comparator<Vector.Element> byDescendingWeight = new Comparator<Vector.Element>() {
            @Override
            public int compare(Vector.Element o1, Vector.Element o2) {
                return Double.compare(o2.get(), o1.get());
            }
        };

        List<Vector.Element> elements = new ArrayList<Vector.Element>();
        for (Vector.Element ve : v.nonZeroes()) {
            // Vector implementations may hand back the same Element instance on every iteration
            // step, so keep an element bound to the index instead of the iterator's object.
            elements.add(v.getElement(ve.index()));
        }
        Collections.sort(elements, byDescendingWeight);

        BiMap<String, String> idByIndex = elementIndex.inverse();
        StringBuilder doc = new StringBuilder();
        for (Vector.Element ve : elements) {
            doc.append(idByIndex.get(String.valueOf(ve.index()))).append(" ");
        }
        return doc.toString();
    }

    /**
     * Recursively deletes the item-links and user-history output directories when they exist.
     * Nothing else under the output location is touched.
     *
     * @throws IOException if an existence check or a delete fails
     */
    private void cleanOutputDirs() throws IOException {
        // Resolve both targets up front, then remove whichever ones are present.
        Path[] staleOutputs = {
                new Path(options.getSolrItemLinksDocsDir()),
                new Path(options.getSolrUserHistoryDir())
        };
        for (Path stale : staleOutputs) {
            if (fs.exists(stale)) {
                fs.delete(stale, true);
            }
        }
    }

    /**
     * Command line entry point: runs this job through {@code ToolRunner} with a new
     * {@code Configuration}. The status value returned by the run is not used.
     *
     * @param args command line arguments passed on to the job
     * @throws Exception if the job throws
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Tool job = new WriteToSolrJob();
        ToolRunner.run(conf, job, args);
    }

    // Command line options for this job. Execute the main method above with no parameters
    // to get a help listing.
    //

    /**
     * Command line options for this job. Values are bound by args4j through the setters carrying
     * an {@code @Option} annotation; the remaining fields keep their defaults or are derived
     * inside those setters.
     *
     * <p>Required: {@code -ism}, {@code -upm}, {@code -ix}, {@code -o}. Optional: {@code -icsm},
     * {@code -usm}, {@code -uix}, {@code -iix}, {@code -t}.</p>
     */
    public class Options {

        //used by Solr
        private static final String DEFAULT_ITEM_ID_FIELD_NAME = "id";
        private static final String DEFAULT_USER_ID_FIELD_NAME = "id";//id is a Solr required field unique per index
        private static final String DEFAULT_B_TRANSPOSE_B_FIELD_NAME = "b_b_links";
        private static final String DEFAULT_B_TRANSPOSE_A_FIELD_NAME = "b_a_links";
        private static final String DEFAULT_ITEM_INDEX_FILENAME = ActionSplitterJob.Options.DEFAULT_ITEM_INDEX_FILENAME;
        private static final String DEFAULT_USER_INDEX_FILENAME = ActionSplitterJob.Options.DEFAULT_USER_INDEX_FILENAME;
        private static final String DEFAULT_JOINED_LINKS_MATRIX_DIR = "joined-item-links-matrix";
        private static final String DEFAULT_SOLR_ITEM_LINKS_DOCS_DIR = "item-links-docs";
        private static final String DEFAULT_A_HISTORY_FIELD_NAME = "a_history";
        private static final String DEFAULT_B_HISTORY_FIELD_NAME = "b_history";
        private static final String DEFAULT_SOLR_USER_HISTORY_DOCS_DIR = "user-history-docs";
        private static final String DEFAULT_TEMP_DIR = "tmp";
        private String bTransposeBMatrixDir;//required (-ism)
        private String bTransposeAMatrixDir = "";//optional (-icsm); note the default is "" and not null
        private String aUserHistoryMatrixDir;//optional (-usm); null unless supplied
        private String bUserHistoryMatrixDir;//required (-upm)
        private String indexesDir;//required (-ix)
        private String userIndexFilePath;//-uix, otherwise derived from indexesDir
        private String itemIndexFilePath;//-iix, otherwise derived from indexesDir
        private String outputDir;//required (-o)
        private String itemIdFieldName = DEFAULT_ITEM_ID_FIELD_NAME;
        private String userIdFieldName = DEFAULT_USER_ID_FIELD_NAME;
        private String bTranposeBFieldName = DEFAULT_B_TRANSPOSE_B_FIELD_NAME;
        private String bTransposeAFieldName = DEFAULT_B_TRANSPOSE_A_FIELD_NAME;
        private String bUserHistoryFieldName = DEFAULT_B_HISTORY_FIELD_NAME;
        private String aUserHistoryFieldName = DEFAULT_A_HISTORY_FIELD_NAME;
        private String solrItemLinksDocsDir;//derived from the required output dir
        private String solrItemsLinksDocFilePath;//never assigned in this class, so its getter returns null
        private String tempDir = DEFAULT_TEMP_DIR;
        private String solrUserHistoryDir;//derived from the required output dir

        Options() {
        }

        public String getTempDir() {
            return tempDir;
        }

        public String getBUserHistoryFieldName() {
            return bUserHistoryFieldName;
        }

        public String getAUserHistoryFieldName() {
            return aUserHistoryFieldName;
        }

        /** @return the user-history output directory, set as a side effect of {@link #setOutputDir(String)} */
        public String getSolrUserHistoryDir() {
            return solrUserHistoryDir;
        }

        @Option(name = "-t", aliases = {
                "--tempDir" }, usage = "Directory for intermediate results (optional). Default: 'output/tmp'.", required = false)
        public void setTempDir(String tempDir) {
            this.tempDir = tempDir;
        }

        public String getBUserHistoryMatrixDir() {
            return bUserHistoryMatrixDir;
        }

        // The usage text previously said "(optional)" although this option is required.
        @Option(name = "-upm", aliases = {
                "--usersPrimaryHistoryDir" }, usage = "Input directory containing the Mahout DistributedRowMatrix with users' history of primary actions.", required = true)
        public void setBUserHistoryMatrixDir(String bUserHistoryMatrixDir) {
            this.bUserHistoryMatrixDir = bUserHistoryMatrixDir;
        }

        /** @return the secondary user-history matrix directory, or null when {@code -usm} was not given */
        public String getAUserHistoryMatrixDir() {
            return aUserHistoryMatrixDir;
        }

        @Option(name = "-usm", aliases = {
                "--usersSecondaryHistoryDir" }, usage = "Input directory containing the Mahout DistributedRowMatrix with users' history of secondary or 'cross' actions (optional).", required = false)
        public void setAUserHistoryMatrixDir(String aUserHistoryMatrixDir) {
            this.aUserHistoryMatrixDir = aUserHistoryMatrixDir;
        }

        public String getItemIdFieldName() {
            return itemIdFieldName;
        }

        public String getUserIdFieldName() {
            return userIdFieldName;
        }

        public String getBTranposeBFieldName() {
            return bTranposeBFieldName;
        }

        public String getBTransposeAFieldName() {
            return bTransposeAFieldName;
        }

        public String getBTransposeBMatrixDir() {
            return bTransposeBMatrixDir;
        }

        @Option(name = "-ism", aliases = {
                "--itemSimilarityMatrixDir" }, usage = "Input directory containing the Mahout DistributedRowMatrix with Item-Item similarities for the primary action.", required = true)
        public void setBTransposeBMatrixDir(String bTransposeBMatrixDir) {
            this.bTransposeBMatrixDir = bTransposeBMatrixDir;
        }

        /** @return the cross-similarity matrix directory, or an empty string when {@code -icsm} was not given */
        public String getBTransposeAMatrixDir() {
            return bTransposeAMatrixDir;
        }

        @Option(name = "-icsm", aliases = {
                "--itemCrossSimilarityMatrixDir" }, usage = "Input directory containing the Mahout DistributedRowMatrix with Item-Item cross-action similarities.", required = false)
        public void setBTransposeAMatrixDir(String bTransposeAMatrixDir) {
            this.bTransposeAMatrixDir = bTransposeAMatrixDir;
        }

        public String getIndexesDir() {
            return indexesDir;
        }

        /**
         * Sets the index directory and fills in the user and item index file paths beneath it,
         * but only for paths that have not already been set.
         */
        @Option(name = "-ix", aliases = {
                "--indexDir" }, usage = "Directory containing user and item indexes.", required = true)
        public void setIndexesDir(String indexesDir) {
            this.indexesDir = indexesDir;
            if (this.userIndexFilePath == null) {
                userIndexFilePath = new Path(indexesDir, DEFAULT_USER_INDEX_FILENAME).toString();
            }
            if (this.itemIndexFilePath == null) {
                itemIndexFilePath = new Path(indexesDir, DEFAULT_ITEM_INDEX_FILENAME).toString();
            }
        }

        public String getUserIndexFilePath() {
            return userIndexFilePath;
        }

        @Option(name = "-uix", aliases = {
                "--userIndex" }, usage = "Input directory containing the serialized BiMap of Mahout ID <-> external ID (optional, overrides --indexDir). Default: indexDir/user-index.", required = false)
        public void setUserIndexFilePath(String userIndexFilePath) {
            this.userIndexFilePath = userIndexFilePath;
        }

        public String getItemIndexFilePath() {
            return itemIndexFilePath;
        }

        @Option(name = "-iix", aliases = {
                "--itemIndex" }, usage = "Input directory containing the serialized BiMap of Mahout ID <-> external ID (optional, overrides --indexDir). Default: indexDir/item-index", required = false)
        public void setItemIndexFilePath(String itemIndexFilePath) {
            this.itemIndexFilePath = itemIndexFilePath;
        }

        public String getOutputDir() {
            return outputDir;
        }

        /**
         * Sets the output directory and derives the item-links and user-history document
         * directories as children of it.
         */
        @Option(name = "-o", aliases = {
                "--output" }, usage = "Where to write docs of ids for indexing. Danger: will be cleaned before writing!", required = true)
        public void setOutputDir(String outputDir) {
            this.outputDir = outputDir;
            Path outputRoot = new Path(this.outputDir);
            this.solrItemLinksDocsDir = new Path(outputRoot, DEFAULT_SOLR_ITEM_LINKS_DOCS_DIR).toString();
            this.solrUserHistoryDir = new Path(outputRoot, DEFAULT_SOLR_USER_HISTORY_DOCS_DIR).toString();
        }

        /** @return the item-links output directory, set as a side effect of {@link #setOutputDir(String)} */
        public String getSolrItemLinksDocsDir() {
            return solrItemLinksDocsDir;
        }

        /** @return always null as this class stands; the backing field is never assigned */
        public String getSolrItemsLinksDocFilePath() {
            return solrItemsLinksDocFilePath;
        }

        /**
         * Renders every option as a multi-line dump in which each line starts with '#', followed
         * by a final line carrying the current timestamp.
         */
        @Override
        public String toString() {
            String dump = ReflectionToStringBuilder.toString(this, ToStringStyle.MULTI_LINE_STYLE);
            String timestamp = new SimpleDateFormat("MM/dd/yyyy h:mm:ss a").format(new Date());
            // Literal replacement: prefix every line after the first with '#', then the first one.
            return "#" + dump.replace("\n", "\n#") + "\n# Timestamp for data creation = " + timestamp;
        }
    }

}