Java tutorial
package finderbots.recommenders.hadoop; /** * Licensed to Patrick J. Ferrel (PJF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. PJF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * If someone wants license or copyrights to this let me know * pat.ferrel@gmail.com * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ /** * <p>Writes the DRMs passed in to Solr as csv files to a location in HDFS or the local file system. The Primary DRM is expected to be a item-item similarity matrix with Mahout internal ID. The Secondary DRM is from cross-action-similarities. It also needs a file containing a map of internal mahout IDs to external IDs--one for userIDs and one for itemIDs. It needs the location to put the similarity matrices, each will be put into a Solr fields of type 'string' for indexing.</p> * <p>The Solr csv files will be of the form:</p> * <p>item_id,similar_items,cross_action_similar_items</p> * <p> ipad,iphone,iphone nexus</p> * <p> iphone,ipad,ipad galaxy</p> * <p>todo: There are two shared in-memory BiHashMaps per node. To remove the in-memory maps a more complex data flow needs to be implemented.</p> * <p>todo: Solr and LucidWorks Search support many stores for indexing. It might be nice to have a pluggable writer for different stores.</p> */ import com.google.common.collect.BiMap; import org.apache.commons.lang.builder.ReflectionToStringBuilder; import org.apache.commons.lang.builder.ToStringStyle; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Logger; import org.apache.mahout.math.Vector; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.Option; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.*; public class WriteToSolrJob extends Configured implements Tool { private static Logger LOGGER = Logger.getRootLogger(); private static Options options; FileSystem fs; /* This class joins A and B by user ID and writes the data to a set of CSV files with the following headers: * id,b_history,a_history * * It joins [B'B] and [B'A] by item ID and writes the data as a set of csv files with the following headers: * id,b_b_links,b_a_links */ @Override public int run(String[] args) throws Exception { options = new Options(); CmdLineParser parser = new CmdLineParser(options); String s = options.toString();// for debuging ease try { parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); parser.printUsage(System.err); return -1; } fs = FileSystem.get(getConf()); cleanOutputDirs(); Path itemIndexPath = new Path(options.getItemIndexFilePath()); Path userIndexPath = new Path(options.getUserIndexFilePath()); Map fields = new HashMap<String, String>(); //inputs Path bTransposeByMatrixPath = new Path(options.getBTransposeBMatrixDir()); Path bUserHistoryMatrixPath = new Path(options.getBUserHistoryMatrixDir()); //outputs Path solrItemsLinksDocsFilesPath = new Path(options.getSolrItemLinksDocsDir()); Path solrUserHistoryDocsFilesPath = new Path(options.getSolrUserHistoryDir()); if (options.getBTransposeAMatrixDir() != null && options.getAUserHistoryMatrixDir() != null) { //optional inputs Path bTransposeAMatrixPath = new Path(options.getBTransposeAMatrixDir()); Path aUserHistoryMatrixPath = new Path(options.getAUserHistoryMatrixDir()); fields.put("iD1", options.getItemIdFieldName()); fields.put("dRM1FieldName", options.getBTranposeBFieldName()); fields.put("dRM2FieldName", options.getBTransposeAFieldName()); WriteDRMsToSolr join = new WriteDRMsToSolr(fields); join.joinDRMsWriteToSolr(itemIndexPath, itemIndexPath, bTransposeByMatrixPath, bTransposeAMatrixPath, solrItemsLinksDocsFilesPath); fields.clear(); fields.put("iD1", options.getUserIdFieldName()); fields.put("dRM1FieldName", options.getBUserHistoryFieldName()); fields.put("dRM2FieldName", options.getAUserHistoryFieldName()); join = new WriteDRMsToSolr(fields); join.joinDRMsWriteToSolr(userIndexPath, itemIndexPath, bUserHistoryMatrixPath, aUserHistoryMatrixPath, solrUserHistoryDocsFilesPath); } else { //only using B actions so no CoGroup join required fields.put("iD1", options.getItemIdFieldName()); fields.put("dRM1FieldName", options.getBTranposeBFieldName()); WriteDRMsToSolr join = new WriteDRMsToSolr(fields); join.writeDRMToSolr(itemIndexPath, itemIndexPath, bTransposeByMatrixPath, solrItemsLinksDocsFilesPath); fields.clear(); fields.put("iD1", options.getUserIdFieldName()); fields.put("dRM1FieldName", options.getBUserHistoryFieldName()); join = new WriteDRMsToSolr(fields); join.writeDRMToSolr(userIndexPath, itemIndexPath, bUserHistoryMatrixPath, solrUserHistoryDocsFilesPath); } return 0; } private String getOrderedItems(Vector v, BiMap<String, String> elementIndex) { String doc = new String(""); //sort the vector by element weight class VectorElementComparator implements Comparator<Vector.Element> { @Override public int compare(Vector.Element o1, Vector.Element o2) { return (o1.get() > o2.get() ? -1 : (o1.equals(o2) ? 0 : 1)); } } ArrayList<Vector.Element> vel = new ArrayList<Vector.Element>(); for (Vector.Element ve : v.nonZeroes()) vel.add(ve); Collections.sort(vel, new VectorElementComparator()); for (Vector.Element ve : vel) { int i = ve.index(); String s = String.valueOf(i); String exID = elementIndex.inverse().get(s); String intID = elementIndex.get(s); doc += exID + " "; } return doc; } private void cleanOutputDirs() throws IOException { //delete only the ones we want to overwrite Path solrItemsSimilaritiesDocsDir = new Path(options.getSolrItemLinksDocsDir()); Path solrUserHistoryDocsDir = new Path(options.getSolrUserHistoryDir()); if (fs.exists(solrItemsSimilaritiesDocsDir)) fs.delete(solrItemsSimilaritiesDocsDir, true); if (fs.exists(solrUserHistoryDocsDir)) fs.delete(solrUserHistoryDocsDir, true); } public static void main(String[] args) throws Exception { ToolRunner.run(new Configuration(), new WriteToSolrJob(), args); } // Command line options for this job. Execute the main method above with no parameters // to get a help listing. // public class Options { //used by Solr private static final String DEFAULT_ITEM_ID_FIELD_NAME = "id"; private static final String DEFAULT_USER_ID_FIELD_NAME = "id";//id is a Solr required field unique per index private static final String DEFAULT_B_TRANSPOSE_B_FIELD_NAME = "b_b_links"; private static final String DEFAULT_B_TRANSPOSE_A_FIELD_NAME = "b_a_links"; private static final String DEFAULT_ITEM_INDEX_FILENAME = ActionSplitterJob.Options.DEFAULT_ITEM_INDEX_FILENAME; private static final String DEFAULT_USER_INDEX_FILENAME = ActionSplitterJob.Options.DEFAULT_USER_INDEX_FILENAME; private static final String DEFAULT_JOINED_LINKS_MATRIX_DIR = "joined-item-links-matrix"; private static final String DEFAULT_SOLR_ITEM_LINKS_DOCS_DIR = "item-links-docs"; private static final String DEFAULT_A_HISTORY_FIELD_NAME = "a_history"; private static final String DEFAULT_B_HISTORY_FIELD_NAME = "b_history"; private static final String DEFAULT_SOLR_USER_HISTORY_DOCS_DIR = "user-history-docs"; private static final String DEFAULT_TEMP_DIR = "tmp"; private String bTransposeBMatrixDir;//required private String bTransposeAMatrixDir = "";//optional private String aUserHistoryMatrixDir;//required private String bUserHistoryMatrixDir;//required private String indexesDir;//required private String userIndexFilePath; private String itemIndexFilePath; private String outputDir;//required private String itemIdFieldName = DEFAULT_ITEM_ID_FIELD_NAME; private String userIdFieldName = DEFAULT_USER_ID_FIELD_NAME; private String bTranposeBFieldName = DEFAULT_B_TRANSPOSE_B_FIELD_NAME; private String bTransposeAFieldName = DEFAULT_B_TRANSPOSE_A_FIELD_NAME; private String bUserHistoryFieldName = DEFAULT_B_HISTORY_FIELD_NAME; private String aUserHistoryFieldName = DEFAULT_A_HISTORY_FIELD_NAME; private String solrItemLinksDocsDir;//derived from requied output dir private String solrItemsLinksDocFilePath;//derived from required stuff private String tempDir = DEFAULT_TEMP_DIR;//defaults to output/tmp private String solrUserHistoryDir;//derived from required stuff Options() { } public String getTempDir() { return tempDir; } public String getBUserHistoryFieldName() { return bUserHistoryFieldName; } public String getAUserHistoryFieldName() { return aUserHistoryFieldName; } public String getSolrUserHistoryDir() { return solrUserHistoryDir; } @Option(name = "-t", aliases = { "--tempDir" }, usage = "Directory for intermediate results (optional). Default: 'output/tmp'.", required = false) public void setTempDir(String tempDir) { this.tempDir = tempDir; } public String getBUserHistoryMatrixDir() { return bUserHistoryMatrixDir; } @Option(name = "-upm", aliases = { "--usersPrimaryHistoryDir" }, usage = "Input directory containing the Mahout DistributedRowMatrix with users' history of primary actions (optional).", required = true) public void setBUserHistoryMatrixDir(String bUserHistoryMatrixDir) { this.bUserHistoryMatrixDir = bUserHistoryMatrixDir; } public String getAUserHistoryMatrixDir() { return aUserHistoryMatrixDir; } @Option(name = "-usm", aliases = { "--usersSecondaryHistoryDir" }, usage = "Input directory containing the Mahout DistributedRowMatrix with users' history of secondary or 'cross' actions (optional).", required = false) public void setAUserHistoryMatrixDir(String aUserHistoryMatrixDir) { this.aUserHistoryMatrixDir = aUserHistoryMatrixDir; } public String getItemIdFieldName() { return itemIdFieldName; } public String getUserIdFieldName() { return userIdFieldName; } public String getBTranposeBFieldName() { return bTranposeBFieldName; } public String getBTransposeAFieldName() { return bTransposeAFieldName; } public String getBTransposeBMatrixDir() { return bTransposeBMatrixDir; } @Option(name = "-ism", aliases = { "--itemSimilarityMatrixDir" }, usage = "Input directory containing the Mahout DistributedRowMatrix with Item-Item similarities for the primary action.", required = true) public void setBTransposeBMatrixDir(String bTransposeBMatrixDir) { this.bTransposeBMatrixDir = bTransposeBMatrixDir; } public String getBTransposeAMatrixDir() { return bTransposeAMatrixDir; } @Option(name = "-icsm", aliases = { "--itemCrossSimilarityMatrixDir" }, usage = "Input directory containing the Mahout DistributedRowMatrix with Item-Item cross-action similarities.", required = false) public void setBTransposeAMatrixDir(String bTransposeAMatrixDir) { this.bTransposeAMatrixDir = bTransposeAMatrixDir; } public String getIndexesDir() { return indexesDir; } @Option(name = "-ix", aliases = { "--indexDir" }, usage = "Directory containing user and item indexes.", required = true) public void setIndexesDir(String indexesDir) { this.indexesDir = indexesDir; if (this.userIndexFilePath == null) userIndexFilePath = new Path(indexesDir, DEFAULT_USER_INDEX_FILENAME).toString(); if (this.itemIndexFilePath == null) itemIndexFilePath = new Path(indexesDir, DEFAULT_ITEM_INDEX_FILENAME).toString(); } public String getUserIndexFilePath() { return userIndexFilePath; } @Option(name = "-uix", aliases = { "--userIndex" }, usage = "Input directory containing the serialized BiMap of Mahout ID <-> external ID (optional, overrides --indexDir). Default: indexDir/user-index.", required = false) public void setUserIndexFilePath(String userIndexFilePath) { this.userIndexFilePath = userIndexFilePath; } public String getItemIndexFilePath() { return itemIndexFilePath; } @Option(name = "-iix", aliases = { "--itemIndex" }, usage = "Input directory containing the serialized BiMap of Mahout ID <-> external ID (optional, overrides --indexDir). Default: indexDir/item-index", required = false) public void setItemIndexFilePath(String itemIndexFilePath) { this.itemIndexFilePath = itemIndexFilePath; } public String getOutputDir() { return outputDir; } @Option(name = "-o", aliases = { "--output" }, usage = "Where to write docs of ids for indexing. Danger: will be cleaned before writing!", required = true) public void setOutputDir(String outputDir) { this.outputDir = outputDir; this.solrItemLinksDocsDir = new Path(new Path(this.outputDir), DEFAULT_SOLR_ITEM_LINKS_DOCS_DIR) .toString(); this.solrUserHistoryDir = new Path(new Path(this.outputDir), DEFAULT_SOLR_USER_HISTORY_DOCS_DIR) .toString(); } public String getSolrItemLinksDocsDir() { return solrItemLinksDocsDir; } public String getSolrItemsLinksDocFilePath() { return solrItemsLinksDocFilePath; } /* not needed? @Option(name = "-t", aliases = {"--tempDir"}, usage = "Place for intermediate data. Things left after the jobs but erased before starting new ones.", required = false) public void setTempDir(String tempDir) { this.tempDir = tempDir; } public String getTempDir() { return this.tempDir; } */ @Override public String toString() { String options = ReflectionToStringBuilder.toString(this, ToStringStyle.MULTI_LINE_STYLE); options = options.replaceAll("\n", "\n#"); Date date = new Date(); SimpleDateFormat sdf = new SimpleDateFormat("MM/dd/yyyy h:mm:ss a"); String formattedDate = sdf.format(date); options = options + "\n# Timestamp for data creation = " + formattedDate; return options = new StringBuffer(options).insert(0, "#").toString(); } } }