Java tutorial
package de.l3s.streamcorpus.mapreduce;

/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.uk
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is HadoopIndexing.java.
 *
 * The Original Code is Copyright (C) 2004-2014 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Richard McCreadie <richardm{a.}dcs.gla.ac.uk> (original author)
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
 */

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.terrier.structures.BitIndexPointer;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.IndexOnDisk;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.bit.BitPostingIndexInputStream;
import org.terrier.structures.indexing.CompressionFactory;
import org.terrier.structures.indexing.CompressionFactory.BitCompressionConfiguration;
import org.terrier.structures.indexing.LexiconBuilder;
import org.terrier.structures.indexing.singlepass.hadoop.Hadoop_BasicSinglePassIndexer;
import org.terrier.structures.indexing.singlepass.hadoop.Hadoop_BlockSinglePassIndexer;
import org.terrier.structures.indexing.singlepass.hadoop.MapEmittedPostingList;
import org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat;
import org.terrier.structures.indexing.singlepass.hadoop.SplitEmittedTerm;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.FieldScore;
import org.terrier.utility.Files;
import org.terrier.utility.TerrierTimer;
import org.terrier.utility.io.HadoopPlugin;
import org.terrier.utility.io.HadoopUtility;

/**
 * Main run class for the MapReduce indexing system.
 * Provides facilities to perform indexing over multiple
 * machines in a MapReduce cluster.
 * <p><h3>Input</h3>
 * The collection is assumed to be a list of files, as specified in collection.spec.
 * For more advanced collections, this class will need to be changed.
 * The files listed in collection.spec are assumed to be on the Hadoop shared default
 * filesystem - usually HDFS (else Hadoop will throw an error).
 * </p>
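 * <p>For illustration (the paths below are hypothetical), a collection.spec simply lists
 * one input file per line; lines starting with '#' are ignored:
 * <pre>
 * # StreamCorpus chunk files on HDFS
 * hdfs://namenode/data/streamcorpus/chunk-000.sc.xz
 * hdfs://namenode/data/streamcorpus/chunk-001.sc.xz
 * </pre>
 * </p>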
 * <p><h3>Output</h3>
 * This class creates indices for the indexed collection, in the directory specified by
 * <tt>terrier.index.path</tt>. If this folder is NOT on the Hadoop shared default
 * filesystem (e.g. HDFS), then Hadoop will throw an error.
 * </p>
 * <p><h3>Reducers</h3>
 * Two reduce modes are supported: <i>term-partitioning</i> creates
 * a single index with multiple files making up the inverted structure; <i>document-partitioning</i>
 * creates multiple indices, partitioned by docid. More reduce tasks result in higher indexing
 * speed due to greater concurrency.
 * <p>
 * Term-partitioning is the default scenario. In this scenario, the maximum number of reducers
 * allowed is 26. To select document-partitioning, specify the -p flag to main().
 * <p>
 * <b>Properties:</b>
 * <ul>
 * <li><tt>terrier.hadoop.indexing.reducers</tt> - number of reduce tasks, defaults to 26.</li>
 * <li>If <tt>block.indexing</tt> is set, then a block index will be created.</li>
 * </ul>
 *
 * @author Richard McCreadie and Craig Macdonald
 * @since 2.2
 */
@SuppressWarnings("deprecation")
public class TerrierIndexing extends Configured implements Tool {

	static final int MAX_REDUCE = 26;

	/** logger for this class */
	protected static final Logger logger = Logger.getLogger(TerrierIndexing.class);

	/*private static String usage() {
		return "Usage: HadoopIndexing [-p]";
	}*/

	/** Starts the MapReduce indexing.
	 * @param args
	 * @throws Exception
	 */
	public int run(String[] args) throws Exception {
		long time = System.currentTimeMillis();

		// For the moment: hard-code the Terrier home for a quick test
		System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer");

		boolean docPartitioned = false;
		int numberOfReducers = Integer
				.parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
		final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
		if (args.length == 2 && args[0].equals("-p")) {
			logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices.");
			numberOfReducers = Integer.parseInt(args[1]);
			docPartitioned = true;
		} else if (args.length == 1 && args[0].equals("--merge")) {
			if (numberOfReducers > 1)
				mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
			else
				logger.error("No point merging 1 reduce task output");
			return 0;
		} else if (args.length == 0) {
			logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
			docPartitioned = false;
			if (numberOfReducers > MAX_REDUCE) {
				logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
						+ "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
			}
		}
		/*else {
			logger.fatal(usage());
			return 0;
		}*/

		if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0], false)
				instanceof BitCompressionConfiguration)) {
			logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
					+ " - you can recompress the inverted index later using IndexRecompressor");
			return 0;
		}

		if (jf == null)
			throw new Exception("Could not get JobFactory from HadoopPlugin");
		final JobConf conf = jf.newJob();
		conf.setJarByClass(TerrierIndexing.class);
		conf.setJobName("StreamCorpusIndexer: Terrier Indexing");
		if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
				&& Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
			logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
					+ ApplicationSetup.TERRIER_INDEX_PREFIX);
			return 0;
		}

		// boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
		boolean blockIndexing = true;
		if (blockIndexing) {
			conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
			conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
		} else {
			conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
			conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
		}
		FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
		conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
		conf.setMapOutputKeyClass(SplitEmittedTerm.class);
		conf.setMapOutputValueClass(MapEmittedPostingList.class);
		conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

		if (!conf.get("mapred.job.tracker").equals("local")) {
			conf.setMapOutputCompressorClass(GzipCodec.class);
			conf.setCompressMapOutput(true);
		} else {
			conf.setCompressMapOutput(false);
		}

		conf.setInputFormat(MultiFileCollectionInputFormat.class);
		conf.setOutputFormat(NullOutputFormat.class);
		conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
		conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
		conf.setReduceSpeculativeExecution(false);

		// parse the collection.spec
		BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
		String line = null;
		List<Path> paths = new ArrayList<Path>();
		while ((line = specBR.readLine()) != null) {
			if (line.startsWith("#"))
				continue;
			paths.add(new Path(line));
		}
		specBR.close();
		FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

		// not sure if this is effective in YARN
		conf.setNumMapTasks(2000);

		// increase the heap usage
		conf.set("mapreduce.map.memory.mb", "6100");
		conf.set("mapred.job.map.memory.mb", "6100");
		conf.set("mapreduce.reduce.memory.mb", "6144");
		conf.set("mapred.job.reduce.memory.mb", "6144");
		conf.set("mapreduce.map.java.opts", "-Xmx6100m");
		conf.set("mapred.map.child.java.opts", "-Xmx6100m");
		conf.set("mapreduce.reduce.java.opts", "-Xmx6144m");
		conf.set("mapred.reduce.child.java.opts", "-Xmx6144m");

		//conf.setBoolean("mapred.used.genericoptionsparser", true);
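		// Note: both the mapreduce.* and the older mapred.* property names are set above, so the
		// same container/heap sizes apply whether the job runs on classic MapReduce or on YARN.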
		// This is the nasty thing in MapReduce v2 and YARN: they always prefer their ancient jars first.
		// Set this on to say you don't like it.
		conf.set("mapreduce.job.user.classpath.first", "true");

		// increase the YARN memory
		conf.set("yarn.nodemanager.resource.memory-mb", "12288");
		conf.set("yarn.nodemanager.resource.cpu-vcores", "16");
		conf.set("yarn.scheduler.minimum-allocation-mb", "4096");

		conf.setNumReduceTasks(numberOfReducers);
		if (numberOfReducers > 1) {
			if (docPartitioned)
				conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
			else
				conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
		} else {
			// for JUnit tests, we seem to need to restore the original partitioner class
			conf.setPartitionerClass(HashPartitioner.class);
		}

		/*JobID jobId = null;
		boolean ranOK = true;
		try {
			RunningJob rj = JobClient.runJob(conf);
			jobId = rj.getID();
			HadoopUtility.finishTerrierJob(conf);
		} catch (Exception e) {
			logger.error("Problem running job", e);
			e.printStackTrace();
			ranOK = false;
		}
		if (jobId != null) {
			deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
		}*/

		//if (ranOK)
		//{
		System.out.println("Merging indices");
		if (!docPartitioned) {
			if (numberOfReducers > 1)
				mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
		}
		Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
				docPartitioned ? numberOfReducers : 1, jf);
		//}

		System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
		jf.close();
		return 0;
	}

	public static void main(String[] args) {
		try {
			ToolRunner.run(new TerrierIndexing(), args);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/** For term-partitioned indexing, this method merges the lexicons from each reducer.
	 * @param index_path path of the index
	 * @param numberOfReducers number of inverted files expected
	 */
	@SuppressWarnings("unchecked")
	protected static void mergeLexiconInvertedFiles(String index_path, int numberOfReducers) throws IOException {
		final String lexiconStructure = "lexicon";
		final String tmpLexiconStructure = "newlex";
		final String invertedStructure = "inverted";
		logger.debug("Merging lexicons");

		// we're handling indices as streams, so we don't need to load them, but remember the previous status;
		// moreover, our indices don't have document objects, so errors may occur in preloading
		final boolean indexProfile = Index.getIndexLoadingProfileAsRetrieval();
		Index.setIndexLoadingProfileAsRetrieval(false);

		// 1. load in the input indices
		final Index[] srcIndices = new Index[numberOfReducers];
		final boolean[] existsIndices = new boolean[numberOfReducers];
		Arrays.fill(existsIndices, true);
		int terms = 0;
		for (int i = 0; i < numberOfReducers; i++) {
			final String index_prefix = ApplicationSetup.TERRIER_INDEX_PREFIX + "-" + i;
			srcIndices[i] = Index.createIndex(index_path, index_prefix);
			if (srcIndices[i] == null) {
				// remove any empty inverted file for this segment
				Files.delete(BitPostingIndexInputStream.getFilename(index_path, index_prefix, invertedStructure,
						(byte) 1, (byte) 1));
				// remember that this index doesn't exist
				existsIndices[i] = false;
				logger.warn("No reduce " + i + " output : no output index [" + index_path + "," + index_prefix + "]");
			} else {
				terms += srcIndices[i].getCollectionStatistics().getNumberOfUniqueTerms();
			}
		}

		// 2. the target index is the first source index
		Index dest = srcIndices[0] != null ? srcIndices[0]
				: Index.createIndex(index_path, ApplicationSetup.TERRIER_INDEX_PREFIX + "-" + 0);
		if (dest == null) {
			throw new IllegalArgumentException(
					"No index found at " + index_path + "," + ApplicationSetup.TERRIER_INDEX_PREFIX + "-" + 0);
		}
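		// The merge below walks each reducer's lexicon in order, reassigning term ids sequentially
		// and stamping every entry with the number of the reducer that wrote its postings, so that
		// the single merged lexicon can address the per-reducer inverted files once they have been
		// renamed into the destination index.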
		// 3. create the new lexicon
		LexiconOutputStream<String> lexOut = new FSOMapFileLexiconOutputStream((IndexOnDisk) dest, tmpLexiconStructure,
				(FixedSizeWriteableFactory<Text>) dest.getIndexStructure(lexiconStructure + "-keyfactory"),
				(Class<? extends FixedSizeWriteableFactory<LexiconEntry>>) dest
						.getIndexStructure(lexiconStructure + "-valuefactory").getClass());

		// 4. append each source lexicon on to the new lexicon, amending the file number as we go
		TerrierTimer tt = new TerrierTimer("Merging lexicon entries", terms);
		tt.start();
		int termId = 0;
		try {
			for (int i = 0; i < numberOfReducers; i++) {
				// the partition did not have any stuff
				if (!existsIndices[i]) {
					// touch an empty inverted index file for this segment, as BitPostingIndex requires that all of the files exist
					Files.writeFileStream(BitPostingIndexInputStream.getFilename((IndexOnDisk) dest, invertedStructure,
							(byte) numberOfReducers, (byte) i)).close();
					continue;
				}
				// else, append the lexicon
				Iterator<Map.Entry<String, LexiconEntry>> lexIn = (Iterator<Map.Entry<String, LexiconEntry>>) srcIndices[i]
						.getIndexStructureInputStream("lexicon");
				while (lexIn.hasNext()) {
					Map.Entry<String, LexiconEntry> e = lexIn.next();
					e.getValue().setTermId(termId);
					((BitIndexPointer) e.getValue()).setFileNumber((byte) i);
					try {
						lexOut.writeNextEntry(e.getKey(), e.getValue());
						termId++;
					} catch (Exception ex) {
						logger.warn("One entry cannot be written. It's okay, just get over it !! ");
					}
				}
				IndexUtil.close(lexIn);
				// rename the inverted file to be part of the destination index
				Files.rename(
						BitPostingIndexInputStream.getFilename((IndexOnDisk) srcIndices[i], invertedStructure,
								(byte) 1, (byte) 1),
						BitPostingIndexInputStream.getFilename((IndexOnDisk) dest, invertedStructure,
								(byte) numberOfReducers, (byte) i));
				tt.increment();
			}
		} finally {
			tt.finished();
		}
		lexOut.close();

		logger.debug("Structure cleanups");
		// 5. change over lexicon structures
		final String[] structureSuffices = new String[] { "", "-entry-inputstream" };
		// remove old lexicon structures
		for (String suffix : structureSuffices) {
			if (!IndexUtil.deleteStructure(dest, lexiconStructure + suffix))
				logger.warn("Structure " + lexiconStructure + suffix + " not found when removing");
		}
		// rename new lexicon structures
		for (String suffix : structureSuffices) {
			if (!IndexUtil.renameIndexStructure(dest, tmpLexiconStructure + suffix, lexiconStructure + suffix))
				logger.warn("Structure " + tmpLexiconStructure + suffix + " not found when renaming");
		}
		IndexUtil.deleteStructure(dest, tmpLexiconStructure + "-valuefactory");

		// 6. update destination index
		if (FieldScore.FIELDS_COUNT > 0)
			dest.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(),
					"java.lang.String", "${index.inverted.fields.count}");
		dest.setIndexProperty("index." + invertedStructure + ".data-files", "" + numberOfReducers);
		LexiconBuilder.optimise((IndexOnDisk) dest, lexiconStructure);
		dest.flush();

		// 7. close source and dest indices
		for (Index src : srcIndices) // dest is also closed
		{
			if (src != null)
				src.close();
		}
		// 8. rearrange indices into the desired layout
		// rename target index
		IndexUtil.renameIndex(index_path, ApplicationSetup.TERRIER_INDEX_PREFIX + "-" + 0, index_path,
				ApplicationSetup.TERRIER_INDEX_PREFIX);
		// delete other source indices
		for (int i = 1; i < numberOfReducers; i++) {
			if (existsIndices[i])
				IndexUtil.deleteIndex(index_path, ApplicationSetup.TERRIER_INDEX_PREFIX + "-" + i);
		}
		// restore loading profile
		Index.setIndexLoadingProfileAsRetrieval(indexProfile);
	}

	/** Performs cleanup of an index path, removing temporary files. */
	public static void deleteTaskFiles(String path, JobID job) {
		String[] fileNames = Files.list(path);
		if (fileNames == null)
			return;
		for (String filename : fileNames) {
			String[] periodParts = filename.split("\\.");
			try {
				TaskID tid = TaskID.forName(periodParts[0]);
				if (tid.getJobID().equals(job)) {
					if (!Files.delete(path + "/" + filename))
						logger.warn("Could not delete temporary map side-effect file " + path + "/" + filename);
				}
			} catch (Exception e) {
			}
		}
	}
}
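As a usage sketch (not part of the original code), the indexer can be launched through Hadoop's ToolRunner, exactly as its own main() does. The launcher class below is hypothetical: the terrier.home path and the reducer count are placeholders, and it assumes Terrier resolves terrier.hadoop.indexing.reducers from the JVM system properties or terrier.properties. Note that run() currently overrides terrier.home with its own hard-coded path.

package de.l3s.streamcorpus.mapreduce;

import org.apache.hadoop.util.ToolRunner;

/** Minimal, hypothetical launcher for TerrierIndexing; all values are placeholders. */
class TerrierIndexingLauncher {
	public static void main(String[] args) throws Exception {
		// Placeholder Terrier installation directory.
		System.setProperty("terrier.home", "/path/to/StreamCorpusIndexer");
		// Assumption: Terrier picks this property up from the system properties or terrier.properties.
		System.setProperty("terrier.hadoop.indexing.reducers", "8");
		// No arguments -> default term-partitioned mode;
		// new String[] { "-p", "8" } -> document-partitioned mode with 8 output indices;
		// new String[] { "--merge" } -> only merge existing per-reducer outputs.
		ToolRunner.run(new TerrierIndexing(), new String[0]);
	}
}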