Java tutorial
/**
 * ImageTerrier - The Terabyte Retriever for Images
 * Webpage: http://www.imageterrier.org/
 * Contact: jsh2@ecs.soton.ac.uk
 * Electronics and Computer Science, University of Southampton
 * http://www.ecs.soton.ac.uk/
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is HadoopIndexer.java
 *
 * The Original Code is Copyright (C) 2011 the University of Southampton
 * and the original contributors.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Jonathon Hare <jsh2@ecs.soton.ac.uk> (original contributor)
 *   Sina Samangooei <ss@ecs.soton.ac.uk>
 *   David Dupplaw <dpd@ecs.soton.ac.uk>
 */
package org.imageterrier.indexers.hadoop;

import gnu.trove.TLongArrayList;

import java.io.IOException;
import java.util.Arrays;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.imageterrier.basictools.BasicTerrierConfig;
import org.imageterrier.hadoop.mapreduce.PositionAwareSequenceFileInputFormat;
import org.imageterrier.hadoop.mapreduce.PositionAwareSplitWrapper;
import org.imageterrier.locfile.QLFDocument;
import org.imageterrier.toolopts.InputMode;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.openimaj.feature.local.LocalFeature;
import org.openimaj.feature.local.list.LocalFeatureList;
import org.openimaj.feature.local.list.MemoryLocalFeatureList;
import org.openimaj.feature.local.quantised.QuantisedLocalFeature;
import org.openimaj.hadoop.tools.clusterquantiser.HadoopClusterQuantiserOptions;
import org.openimaj.io.IOUtils;
import org.openimaj.ml.clustering.ByteCentroidsResult;
import org.openimaj.ml.clustering.IntCentroidsResult;
import org.openimaj.ml.clustering.SpatialClusters;
import org.openimaj.ml.clustering.assignment.HardAssigner;
import org.openimaj.ml.clustering.assignment.hard.KDTreeByteEuclideanAssigner;
import org.openimaj.ml.clustering.assignment.hard.KDTreeIntEuclideanAssigner;
import org.terrier.indexing.AbstractHadoopIndexer;
import org.terrier.indexing.Document;
import org.terrier.indexing.ExtensibleSinglePassIndexer;
import org.terrier.indexing.HadoopIndexerMapper;
import org.terrier.indexing.HadoopIndexerReducer;
import org.terrier.structures.Index;
import org.terrier.structures.indexing.singlepass.hadoop.MapEmittedPostingList;
import org.terrier.structures.indexing.singlepass.hadoop.NewSplitEmittedTerm;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.Files;
/**
 * @author Jonathon Hare
 */
public class HadoopIndexer extends AbstractHadoopIndexer {
    static {
        // initialise terrier
        BasicTerrierConfig.configure();
        ApplicationSetup.setProperty("indexer.meta.forward.keylens", "200");
    }

    protected static final Logger logger = Logger.getLogger(HadoopIndexer.class);

    public static final String INDEXER_ARGS_STRING = "indexer.args";
    public static final String QUANTISER_SIZE = "indexer.quantiser.size";

    /**
     * The mapper implementation for direct quantised feature indexing
     */
    static class QFIndexerMapper extends HadoopIndexerMapper<BytesWritable> {
        protected Class<? extends QuantisedLocalFeature<?>> featureClass;
        protected HadoopIndexerOptions options;

        @Override
        protected ExtensibleSinglePassIndexer createIndexer(Context context) throws IOException {
            options = getOptions(context.getConfiguration());
            featureClass = options.getFeatureClass();
            return options.getIndexType().getIndexer(null, null);
        }

        @SuppressWarnings({ "unchecked", "rawtypes" })
        @Override
        protected Document recordToDocument(Text key, BytesWritable value) throws IOException {
            return new QLFDocument(value.getBytes(), featureClass, key.toString(), null);
        }
    }

    /**
     * Mapper implementation for directly processing images, that is safe to use
     * with a MultithreadedMapper. Each MultithreadedMapper thread will produce
     * its own index.
     */
    static class MTImageIndexerMapper extends HadoopIndexerMapper<BytesWritable> {
        protected static Class<? extends QuantisedLocalFeature<?>> featureClass;
        protected static HadoopIndexerOptions options;
        private static SpatialClusters<?> clusters;
        private static HardAssigner<?, ?, ?> assigner;
        private static TLongArrayList threads = new TLongArrayList();
        private int threadID;

        private static synchronized ExtensibleSinglePassIndexer setupIndexer(Context context) throws IOException {
            if (clusters == null) {
                options = getOptions(context.getConfiguration());
                featureClass = options.getFeatureClass();

                System.out.println("Loading codebook...");

                final String codebookURL = options.getInputModeOptions().getQuantiserFile();
                options.getInputModeOptions().quantiserTypeOp = HadoopClusterQuantiserOptions
                        .sniffClusterType(codebookURL);

                if (options.getInputModeOptions().getQuantiserType() != null)
                    clusters = IOUtils.read(HadoopClusterQuantiserOptions.getClusterInputStream(codebookURL),
                            options.getInputModeOptions().getQuantiserType().getClusterClass());

                if (!options.getInputModeOptions().quantiserExact) {
                    assigner = clusters.defaultHardAssigner();
                } else {
                    if (clusters instanceof ByteCentroidsResult)
                        assigner = new KDTreeByteEuclideanAssigner((ByteCentroidsResult) clusters);
                    else if (clusters instanceof IntCentroidsResult)
                        assigner = new KDTreeIntEuclideanAssigner((IntCentroidsResult) clusters);
                    else
                        assigner = clusters.defaultHardAssigner();
                }

                System.out.println("Done!");
            }

            return options.getIndexType().getIndexer(null, null);
        }

        @Override
        protected ExtensibleSinglePassIndexer createIndexer(Context context) throws IOException {
            synchronized (MTImageIndexerMapper.class) {
                final long id = Thread.currentThread().getId();
                if (!threads.contains(id)) {
                    threads.add(id);
                    this.threadID = threads.indexOf(id);
                }
            }

            return setupIndexer(context);
        }
        @Override
        protected int getSplitNum(Context context) {
            // Splitno is required by the reducer to be unique per mapper (in
            // particular in the .runs files); we modify the splitnos for each
            // thread to allow this to work
            try {
                if (((Class<?>) context.getMapperClass()) == ((Class<?>) (MultithreadedMapper.class))) {
                    final int sidx = ((PositionAwareSplitWrapper<?>) context.getInputSplit()).getSplitIndex();
                    return (sidx * MultithreadedMapper.getNumberOfThreads(context)) + threadID;
                }
            } catch (final ClassNotFoundException e) {
            }
            return ((PositionAwareSplitWrapper<?>) context.getInputSplit()).getSplitIndex();
        }

        @Override
        protected String getTaskID(Context context) {
            // the task id is used to name the shard; we modify it per thread to
            // allow each thread to work on its own shard
            try {
                if (((Class<?>) context.getMapperClass()) == ((Class<?>) (MultithreadedMapper.class))) {
                    return context.getTaskAttemptID().getTaskID().toString() + threadID;
                }
            } catch (final ClassNotFoundException e) {
            }
            return context.getTaskAttemptID().getTaskID().toString();
        }

        @SuppressWarnings({ "unchecked", "rawtypes" })
        @Override
        protected Document recordToDocument(Text key, BytesWritable value) throws IOException {
            // extract features
            LocalFeatureList<? extends LocalFeature<?, ?>> features = null;
            try {
                logger.info("Extracting features...");
                features = options.getInputModeOptions().getFeatureType().extract(value.getBytes());

                logger.info("Quantising features...");
                // quantise features
                final LocalFeatureList<QuantisedLocalFeature<?>> qkeys = new MemoryLocalFeatureList<QuantisedLocalFeature<?>>(
                        features.size());
                if (clusters.getClass().getName().contains("Byte")) {
                    for (final LocalFeature k : features) {
                        final int id = ((HardAssigner<byte[], ?, ?>) assigner)
                                .assign((byte[]) k.getFeatureVector().getVector());
                        qkeys.add(new QuantisedLocalFeature(k.getLocation(), id));
                    }
                } else {
                    for (final LocalFeature k : features) {
                        final int id = ((HardAssigner<int[], ?, ?>) assigner)
                                .assign((int[]) k.getFeatureVector().getVector());
                        qkeys.add(new QuantisedLocalFeature(k.getLocation(), id));
                    }
                }

                logger.info("Constructing QLFDocument...");
                // create document
                return new QLFDocument(qkeys, key.toString().substring(0, Math.min(key.getLength(), 20)), null); // FIXME sort out key length
            } catch (final Throwable e) {
                logger.warn("Skipping image: " + key + " due to: " + e.getMessage());
                return null;
            }
        }
    }

    /**
     * Mapper implementation that uses multiple threads to process images into
     * visual terms and then emits them to the indexer
     */
    static class ImageIndexerMapper extends HadoopIndexerMapper<BytesWritable> {
        protected Class<? extends QuantisedLocalFeature<?>> featureClass;
        protected HadoopIndexerOptions options;
        private ExecutorService service;
        private static HardAssigner<?, ?, ?> assigner;

        @Override
        protected void map(Text key, BytesWritable value, final Context context) throws IOException,
                InterruptedException
        {
            final Text innerkey = new Text(key.toString());
            final BytesWritable innervalue = new BytesWritable(Arrays.copyOf(value.getBytes(), value.getLength()));

            final Callable<Boolean> r = new Callable<Boolean>() {
                @Override
                public Boolean call() throws IOException {
                    // final String docno = innerkey.toString();

                    final Document doc = recordToDocument(innerkey, innervalue);
                    if (doc == null)
                        return false;

                    // long t1 = System.nanoTime();
                    // synchronized (ImageIndexerMapper.this) {
                    //     long t2 = System.nanoTime();
                    //     // System.out.println("Spent " + ((t2-t1)*(1.0e-9)) + "s waiting for lock!");
                    //
                    //     context.setStatus("Currently indexing "+docno);
                    //
                    //     indexDocument(doc, context);
                    //     context.getCounter(Counters.INDEXED_DOCUMENTS).increment(1);
                    // }
                    return true;
                }
            };

            service.submit(r);
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            service.shutdown();
            logger.info("Waiting for mapper threads to finish");
            service.awaitTermination(1, TimeUnit.DAYS);
            logger.info("Mapper threads finished. Cleaning up.");
            super.cleanup(context);
        }

        @Override
        protected ExtensibleSinglePassIndexer createIndexer(Context context) throws IOException {
            options = getOptions(context.getConfiguration());
            featureClass = options.getFeatureClass();

            // load quantiser if required
            loadQuantiser(options, true);

            // set up threadpool
            final int nThreads = options.getMultithread();
            service = new ThreadPoolExecutor(nThreads, nThreads, 0L, TimeUnit.MILLISECONDS,
                    new LinkedBlockingQueue<Runnable>(nThreads)
                    {
                        // the ThreadPoolExecutor calls offer() on the backing queue,
                        // which unfortunately doesn't block, and we end up getting
                        // exceptions because the job could not be executed. This works
                        // around the problem by making offer() block (by calling put()).
                        private static final long serialVersionUID = 1L;

                        @Override
                        public boolean offer(Runnable e) {
                            // turn offer() and add() into blocking calls (unless interrupted)
                            try {
                                put(e);
                                return true;
                            } catch (final InterruptedException ie) {
                                Thread.currentThread().interrupt();
                            }
                            return false;
                        }
                    });

            return options.getIndexType().getIndexer(null, null);
        }
        @SuppressWarnings({ "rawtypes", "unchecked" })
        @Override
        protected Document recordToDocument(Text key, BytesWritable value) throws IOException {
            // extract features
            LocalFeatureList<? extends LocalFeature<?, ?>> features = null;
            try {
                logger.info("Extracting features...");
                features = options.getInputModeOptions().getFeatureType().extract(value.getBytes());

                logger.info("Quantising features...");
                // quantise features
                final LocalFeatureList<QuantisedLocalFeature<?>> qkeys = new MemoryLocalFeatureList<QuantisedLocalFeature<?>>(
                        features.size());
                if (assigner.getClass().getName().contains("Byte")) {
                    for (final LocalFeature k : features) {
                        final int id = ((HardAssigner<byte[], ?, ?>) assigner)
                                .assign((byte[]) k.getFeatureVector().getVector());
                        qkeys.add(new QuantisedLocalFeature(k.getLocation(), id));
                    }
                } else {
                    for (final LocalFeature k : features) {
                        final int id = ((HardAssigner<int[], ?, ?>) assigner)
                                .assign((int[]) k.getFeatureVector().getVector());
                        qkeys.add(new QuantisedLocalFeature(k.getLocation(), id));
                    }
                }

                logger.info("Constructing QLFDocument...");
                // create document
                return new QLFDocument(qkeys, key.toString().substring(0, Math.min(key.getLength(), 20)), null); // FIXME sort out key length
            } catch (final Throwable e) {
                logger.warn("Skipping image: " + key + " due to: " + e.getMessage());
                return null;
            }
        }

        private static synchronized void loadQuantiser(HadoopIndexerOptions options, boolean optimise) throws IOException {
            if (assigner == null) {
                assigner = readQuantiser(options, readClusters(options));
            }
        }
    }

    protected static SpatialClusters<?> readClusters(HadoopIndexerOptions options) throws IOException {
        SpatialClusters<?> clusters = null;

        System.out.println("Loading codebook...");

        final String codebookURL = options.getInputModeOptions().getQuantiserFile();
        options.getInputModeOptions().quantiserTypeOp = HadoopClusterQuantiserOptions.sniffClusterType(codebookURL);

        if (options.getInputModeOptions().getQuantiserType() != null) {
            clusters = IOUtils.read(HadoopClusterQuantiserOptions.getClusterInputStream(codebookURL),
                    options.getInputModeOptions().getQuantiserType().getClusterClass());
        }

        return clusters;
    }

    protected static HardAssigner<?, ?, ?> readQuantiser(HadoopIndexerOptions options, SpatialClusters<?> clusters)
            throws IOException
    {
        HardAssigner<?, ?, ?> assigner = null;

        if (!options.getInputModeOptions().quantiserExact) {
            assigner = clusters.defaultHardAssigner();
        } else {
            if (clusters instanceof ByteCentroidsResult)
                assigner = new KDTreeByteEuclideanAssigner((ByteCentroidsResult) clusters);
            else if (clusters instanceof IntCentroidsResult)
                assigner = new KDTreeIntEuclideanAssigner((IntCentroidsResult) clusters);
            else
                assigner = clusters.defaultHardAssigner();
        }

        System.out.println("Done!");
        return assigner;
    }

    /**
     * The reducer implementation
     */
    static class IndexerReducer extends HadoopIndexerReducer {
        @Override
        protected ExtensibleSinglePassIndexer createIndexer(Context context) throws IOException {
            return getOptions(context.getConfiguration()).getIndexType().getIndexer(null, null);
        }
    }

    private static HadoopIndexerOptions getOptions(Configuration conf) throws IOException {
        final String[] args = conf.getStrings(INDEXER_ARGS_STRING);

        final HadoopIndexerOptions options = new HadoopIndexerOptions();
        final CmdLineParser parser = new CmdLineParser(options);

        try {
            parser.parseArgument(args);
        } catch (final CmdLineException e) {
            throw new IOException(e);
        }

        return options;
    }

    private static final String usage() {
        return "Usage: HadoopIndexing [-p]";
    }

    protected Job createJob(HadoopIndexerOptions options) throws IOException {
        final Job job = new Job(getConf());
        job.setJobName("terrierIndexing");

        if (options.getInputMode() == InputMode.QUANTISED_FEATURES) {
            job.setMapperClass(QFIndexerMapper.class);
        } else {
            if (options.shardPerThread) {
                job.setMapperClass(MultithreadedMapper.class);
                MultithreadedMapper.setMapperClass(job, MTImageIndexerMapper.class);
                MultithreadedMapper.setNumberOfThreads(job, options.getMultithread());
            } else {
                job.setMapperClass(ImageIndexerMapper.class);
            }
        }

        // Load quantiser (if it exists), extract header, count codebook size
        if (options.getInputModeOptions().hasQuantiserFile()) {
            final String quantFile = options.getInputModeOptions().getQuantiserFile();
            System.out.println("Loading codebook to see its size");

            final SpatialClusters<?> quantiser = readClusters(options);

            System.out.println("Setting codebook size: " + quantiser.numClusters());
            job.getConfiguration().setInt(QUANTISER_SIZE, quantiser.numClusters());

            if (quantiser.numClusters() < options.getNumReducers())
                options.setNumReducers(quantiser.numClusters());
        }

        job.setReducerClass(IndexerReducer.class);

        FileOutputFormat.setOutputPath(job, options.getOutputPath());
        job.setMapOutputKeyClass(NewSplitEmittedTerm.class);
        job.setMapOutputValueClass(MapEmittedPostingList.class);
        job.getConfiguration().setBoolean("indexing.hadoop.multiple.indices", options.isDocumentPartitionMode());

        // if (!job.getConfiguration().get("mapred.job.tracker").equals("local")) {
        //     job.getConfiguration().set("mapred.map.output.compression.codec", GzipCodec.class.getCanonicalName());
        //     job.getConfiguration().setBoolean("mapred.compress.map.output", true);
        // } else {
        job.getConfiguration().setBoolean("mapred.compress.map.output", false);
        // }

        job.setInputFormatClass(PositionAwareSequenceFileInputFormat.class); // important
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setSortComparatorClass(NewSplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
        job.setGroupingComparatorClass(NewSplitEmittedTerm.SETRawComparatorTerm.class);

        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

        SequenceFileInputFormat.setInputPaths(job, options.getInputPaths());

        job.setNumReduceTasks(options.getNumReducers());
        if (options.getNumReducers() > 1) {
            if (options.isDocumentPartitionMode()) {
                job.setPartitionerClass(NewSplitEmittedTerm.SETPartitioner.class);
            } else {
                // job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
                if (job.getConfiguration().getInt(QUANTISER_SIZE, -1) == -1) {
                    job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerHashedTerm.class);
                } else {
                    job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerCodebookAwareTerm.class);
                }
            }
        } else {
            // for JUnit tests, we seem to need to restore the original
            // partitioner class
            job.setPartitionerClass(HashPartitioner.class);
        }

        job.setJarByClass(this.getClass());

        return job;
    }

    /**
     * Process the arguments and start the map-reduce indexing.
     *
     * @param args
     * @throws Exception
     */
    @Override
    public int run(String[] args) throws Exception {
        final long time = System.currentTimeMillis();

        final HadoopIndexerOptions options = new HadoopIndexerOptions();
        final CmdLineParser parser = new CmdLineParser(options);

        try {
            parser.parseArgument(args);
        } catch (final CmdLineException e) {
            parser.printUsage(System.err);
            logger.fatal(e.getMessage());
            logger.fatal(usage());
            return 1;
        }

        if (Files.exists(options.getOutputPathString())
                && Index.existsIndex(options.getOutputPathString(), ApplicationSetup.TERRIER_INDEX_PREFIX))
        {
            logger.fatal("Cannot index while index exists at " + options.getOutputPathString() + ","
                    + ApplicationSetup.TERRIER_INDEX_PREFIX);
            return 1;
        }

        // create job
        final Job job = createJob(options);

        // set args string
        job.getConfiguration().setStrings(INDEXER_ARGS_STRING, args);
        options.configureFilterMode(job.getConfiguration());

        // run job
        JobID jobId = null;
        boolean ranOK = true;
        try {
            ranOK = job.waitForCompletion(true);
            jobId = job.getJobID();
        } catch (final Exception e) {
            logger.error("Problem running job", e);
            ranOK = false;
        }

        if (jobId != null) {
            deleteTaskFiles(options.getOutputPathString(), jobId);
        }

        if (ranOK) {
            if (!options.isDocumentPartitionMode()) {
                if (job.getNumReduceTasks() > 1) {
                    mergeLexiconInvertedFiles(options.getOutputPathString(), job.getNumReduceTasks());
                }
            }

            finish(options.getOutputPathString(), options.isDocumentPartitionMode() ? job.getNumReduceTasks() : 1,
                    job.getConfiguration());
        }

        System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");

        return 0;
    }

    public static void main(String[] args) throws Exception {
        // args = new String[] {
        //     "-t", "BASIC",
        //     "-j", "4",
        //     "-nr", "1",
        //     "-fc", "QuantisedKeypoint",
        //     "-o", "/Users/jsh2/test.index",
        //     "-m", "IMAGES",
        //     "-q", "hdfs://seurat.ecs.soton.ac.uk/data/codebooks/small-10.seq/final",
        //     "hdfs://seurat.ecs.soton.ac.uk/data/image-net-timetests/image-net-10.seq"
        // };

        ToolRunner.run(new HadoopIndexer(), args);
    }
}
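The two sketches below are illustrative additions and are not part of HadoopIndexer.java.

The first is a minimal launcher that mirrors the main() method above: it hands an argument array to ToolRunner, which handles generic Hadoop options and then calls HadoopIndexer.run(). The flags are copied from the commented-out example in main(); the class name RunHadoopIndexer and the HDFS paths are hypothetical placeholders, and the flag meanings are defined by HadoopIndexerOptions (not shown in this listing).

// Hypothetical launcher sketch; flags follow the commented-out example in main(),
// paths are placeholders. Assumes the same package as HadoopIndexer.
package org.imageterrier.indexers.hadoop;

import org.apache.hadoop.util.ToolRunner;

public class RunHadoopIndexer {
    public static void main(String[] args) throws Exception {
        final String[] indexerArgs = {
                "-t", "BASIC",
                "-j", "4",
                "-nr", "1",
                "-fc", "QuantisedKeypoint",
                "-o", "hdfs:///path/to/output.index",   // hypothetical output path
                "-m", "IMAGES",
                "-q", "hdfs:///path/to/codebook",       // hypothetical quantiser/codebook
                "hdfs:///path/to/images.seq"            // hypothetical input sequence file
        };

        // ToolRunner strips generic Hadoop options (-D, -conf, ...) and invokes run()
        final int status = ToolRunner.run(new HadoopIndexer(), indexerArgs);
        System.exit(status);
    }
}

The second isolates the thread-pool trick used in ImageIndexerMapper.createIndexer(): the bounded LinkedBlockingQueue overrides offer() to delegate to put(), so ThreadPoolExecutor.submit() blocks when every worker and queue slot is busy instead of throwing RejectedExecutionException. This is a generic, self-contained sketch of that pattern with made-up task names, not ImageTerrier code.

import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class BlockingSubmitExample {
    public static void main(String[] args) throws InterruptedException {
        final int nThreads = 4;

        // Bounded queue whose offer() blocks (via put()) rather than returning false.
        final LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>(nThreads) {
            private static final long serialVersionUID = 1L;

            @Override
            public boolean offer(Runnable r) {
                try {
                    put(r); // wait until a slot is free
                    return true;
                } catch (final InterruptedException ie) {
                    Thread.currentThread().interrupt();
                    return false;
                }
            }
        };

        final ThreadPoolExecutor pool =
                new ThreadPoolExecutor(nThreads, nThreads, 0L, TimeUnit.MILLISECONDS, queue);

        // Submitting more tasks than workers + queue slots now blocks instead of failing.
        for (int i = 0; i < 100; i++) {
            final int task = i;
            pool.submit(new Runnable() {
                @Override
                public void run() {
                    System.out.println("processed task " + task);
                }
            });
        }

        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.MINUTES);
    }
}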