Java tutorial: ParallelSolrIndexer for the LIRE Solr plugin
/*
 * This file is part of the LIRE project: http://www.semanticmetadata.net/lire
 * LIRE is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * LIRE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with LIRE; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * We kindly ask you to refer to any or one of the following publications in
 * any publication mentioning or employing Lire:
 *
 * Lux Mathias, Savvas A. Chatzichristofis. Lire: Lucene Image Retrieval -
 * An Extensible Java CBIR Library. In proceedings of the 16th ACM International
 * Conference on Multimedia, pp. 1085-1088, Vancouver, Canada, 2008
 * URL: http://doi.acm.org/10.1145/1459359.1459577
 *
 * Lux Mathias. Content Based Image Retrieval with LIRE. In proceedings of the
 * 19th ACM International Conference on Multimedia, pp. 735-738, Scottsdale,
 * Arizona, USA, 2011
 * URL: http://dl.acm.org/citation.cfm?id=2072432
 *
 * Mathias Lux, Oge Marques. Visual Information Retrieval using Java and LIRE
 * Morgan & Claypool, 2013
 * URL: http://www.morganclaypool.com/doi/abs/10.2200/S00468ED1V01Y201301ICR025
 *
 * Copyright statement:
 * --------------------
 * (c) 2002-2013 by Mathias Lux (mathias@juggle.at)
 * http://www.semanticmetadata.net/lire, http://www.lire-project.net
 */
package net.semanticmetadata.lire.solr;

import com.jhlabs.image.DespeckleFilter;
import net.semanticmetadata.lire.imageanalysis.*;
import net.semanticmetadata.lire.indexing.hashing.BitSampling;
import net.semanticmetadata.lire.indexing.parallel.WorkItem;
import net.semanticmetadata.lire.solr.indexing.ImageDataProcessor;
import net.semanticmetadata.lire.utils.ImageUtils;
import org.apache.commons.codec.binary.Base64;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;
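// Note on dependencies: ImageIO.read(..) in the Consumer below only handles the
// image formats registered with the JVM. As the comment there points out, adding
// the TwelveMonkeys ImageIO plugins to the classpath extends coverage to most
// JPEG and TIFF variants found in the wild.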
/**
 * This indexing application allows for parallel extraction of global features from multiple image files for
 * use with the LIRE Solr plugin. It basically takes a list of images (e.g. created by something like
 * "dir /s /b > list.txt" or "ls [some parameters] > list.txt").
 * <p/>
 * Use it like:
 * <pre>$> java -jar lire-request-handler.jar -i <infile> [-o <outfile>] [-n <threads>] [-m <max_side_length>] [-f]</pre>
 * <p/>
 * Available options are:
 * <ul>
 * <li>-i <infile> ... gives a file with a list of images to be indexed, one per line.</li>
 * <li>-o <outfile> ... gives the XML file the output is written to. If none is given, one XML file per input image is created.</li>
 * <li>-n <threads> ... gives the number of threads used for extraction. The number of cores is a good value for that.</li>
 * <li>-m <max_side_length> ... gives a maximum side length for extraction. This option is useful if very large images are indexed.</li>
 * <li>-f ... forces overwriting of <outfile>. If the <outfile> already exists and -f is not given, the operation is aborted.</li>
 * <li>-p ... enables image processing before indexing (despeckle, trim white space).</li>
 * <li>-r ... defines a class implementing net.semanticmetadata.lire.solr.indexing.ImageDataProcessor that provides additional fields.</li>
 * <li>-y <features> ... defines which feature classes are to be extracted; the default is ph,cl,eh,jc.</li>
 * </ul>
 * <p/>
 * TODO: Make feature list changeable.
 * <p/>
 * You then basically need to enrich the file with whatever metadata you prefer and send it to Solr using, for instance, curl:
 * <pre>curl http://localhost:9000/solr/lire/update -H "Content-Type: text/xml" --data-binary @extracted_file.xml
 * curl http://localhost:9000/solr/lire/update -H "Content-Type: text/xml" --data-binary "<commit/>"</pre>
 *
 * @author Mathias Lux, mathias@juggle.at on 13.08.2013
 */
public class ParallelSolrIndexer implements Runnable {
    private final int maxCacheSize = 100; // upper bound for the image queue
    private boolean force = false;
    private static boolean individualFiles = false;
    private static int numberOfThreads = 4;
    LinkedBlockingQueue<WorkItem> images = new LinkedBlockingQueue<WorkItem>(maxCacheSize);
    boolean ended = false;
    int overallCount = 0;
    OutputStream dos = null;
    Set<Class> listOfFeatures;
    File fileList = null;
    File outFile = null;
    private int monitoringInterval = 10; // status output interval in seconds
    private int maxSideLength = 512;
    private boolean isPreprocessing = true; // note: preprocessing is on by default, so -p is currently redundant
    private Class imageDataProcessor = null;

    public ParallelSolrIndexer() {
        // default constructor: extract PHOG, ColorLayout, EdgeHistogram and JCD.
        listOfFeatures = new HashSet<Class>();
        listOfFeatures.add(PHOG.class);
        listOfFeatures.add(ColorLayout.class);
        listOfFeatures.add(EdgeHistogram.class);
        listOfFeatures.add(JCD.class);
    }

    /**
     * Sets the number of consumer threads that are employed for extraction.
     *
     * @param numberOfThreads
     */
    public static void setNumberOfThreads(int numberOfThreads) {
        ParallelSolrIndexer.numberOfThreads = numberOfThreads;
    }
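    // A sketch of driving the indexer from your own code instead of the command line,
    // assuming the hash functions file is available just as it is for main(..):
    //
    //   BitSampling.readHashFunctions();                 // load the hashing functions once
    //   ParallelSolrIndexer indexer = new ParallelSolrIndexer();
    //   indexer.setFileList(new File("list.txt"));       // one image path per line
    //   indexer.setOutFile(new File("out.xml"));         // omit this to get one XML file per image
    //   indexer.addFeature(CEDD.class);                  // add a feature beyond the default four
    //   ParallelSolrIndexer.setNumberOfThreads(8);
    //   indexer.run();                                   // blocks until extraction has finished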
    public static void main(String[] args) throws IOException {
        BitSampling.readHashFunctions();
        ParallelSolrIndexer e = new ParallelSolrIndexer();

        // parse program args ...
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if (arg.startsWith("-i")) {
                // infile ...
                if ((i + 1) < args.length) e.setFileList(new File(args[i + 1]));
                else {
                    System.err.println("Could not set input file.");
                    printHelp();
                }
            } else if (arg.startsWith("-o")) {
                // outfile; if it's not set, a single file for each input image is created.
                if ((i + 1) < args.length) e.setOutFile(new File(args[i + 1]));
                else printHelp();
            } else if (arg.startsWith("-m")) {
                // maximum side length
                if ((i + 1) < args.length) {
                    try {
                        int s = Integer.parseInt(args[i + 1]);
                        if (s > 10) e.setMaxSideLength(s);
                    } catch (NumberFormatException e1) {
                        e1.printStackTrace();
                        printHelp();
                    }
                } else printHelp();
            } else if (arg.startsWith("-r")) {
                // image data processor class.
                if ((i + 1) < args.length) {
                    try {
                        Class<?> imageDataProcessorClass = Class.forName(args[i + 1]);
                        if (imageDataProcessorClass.newInstance() instanceof ImageDataProcessor)
                            e.setImageDataProcessor(imageDataProcessorClass);
                    } catch (Exception e1) {
                        System.err.println("Did not find imageProcessor class: " + e1.getMessage());
                        printHelp();
                        System.exit(0);
                    }
                } else printHelp();
            } else if (arg.startsWith("-f") || arg.startsWith("--force")) {
                e.setForce(true);
            } else if (arg.startsWith("-y") || arg.startsWith("--features")) {
                if ((i + 1) < args.length) {
                    // parse and check the features.
                    String[] ft = args[i + 1].split(",");
                    for (int j = 0; j < ft.length; j++) {
                        String s = ft[j].trim();
                        if (FeatureRegistry.getClassForCode(s) != null) {
                            e.addFeature(FeatureRegistry.getClassForCode(s));
                        }
                    }
                }
            } else if (arg.startsWith("-p")) {
                e.setPreprocessing(true);
            } else if (arg.startsWith("-h")) {
                // help
                printHelp();
                System.exit(0);
            } else if (arg.startsWith("-n")) {
                if ((i + 1) < args.length)
                    try {
                        ParallelSolrIndexer.numberOfThreads = Integer.parseInt(args[i + 1]);
                    } catch (Exception e1) {
                        System.err.println("Could not set number of threads to \"" + args[i + 1] + "\".");
                        e1.printStackTrace();
                    }
                else printHelp();
            }
        }
        // check if there is an infile, an outfile and some features to extract.
        if (!e.isConfigured()) {
            printHelp();
        } else {
            e.run();
        }
    }

    private static void printHelp() {
        System.out.println("This help text is shown if you start the ParallelSolrIndexer with the '-h' option.\n" +
                "\n" +
                "$> ParallelSolrIndexer -i <infile> [-o <outfile>] [-n <threads>] [-f] [-p] [-m <max_side_length>] [-r <full class name>] \\ \n" +
                "         [-y <list of feature classes>]\n" +
                "\n" +
                "Note: if you don't specify an outfile, \"_solr.xml\" is appended to each input image for output. So there will be one XML\n" +
                "file per image. Specifying an outfile will collect the information of all images in one single file.\n" +
                "\n" +
                "-n ... number of threads should be something your computer can cope with. default is 4.\n" +
                "-f ... forces overwrite of outfile\n" +
                "-p ... enables image processing before indexing (despeckle, trim white space)\n" +
                "-m ... maximum side length of images when indexed. All bigger files are scaled down. default is 512.\n" +
                "-r ... defines a class implementing net.semanticmetadata.lire.solr.indexing.ImageDataProcessor\n" +
                "       that provides additional fields.\n" +
                "-y ... defines which feature classes are to be extracted. default is \"-y ph,cl,eh,jc\". \"-y ce,ac\" would\n" +
                "       add to the other four features.");
    }

    /**
     * Renders an int array as a string of space-separated hex values, the format
     * used for the hash fields.
     */
    public static String arrayToString(int[] array) {
        StringBuilder sb = new StringBuilder(array.length * 8);
        for (int i = 0; i < array.length; i++) {
            if (i > 0) sb.append(' ');
            sb.append(Integer.toHexString(array[i]));
        }
        return sb.toString();
    }

    /**
     * Adds a feature to the extractor chain. All these features are extracted from images.
     *
     * @param feature
     */
    public void addFeature(Class feature) {
        listOfFeatures.add(feature);
    }

    /**
     * Sets the file list for processing. One image file per line is fine.
     *
     * @param fileList
     */
    public void setFileList(File fileList) {
        this.fileList = fileList;
    }

    /**
     * Sets the outfile. The outfile has to be in a folder parent to all input images.
     *
     * @param outFile
     */
    public void setOutFile(File outFile) {
        this.outFile = outFile;
    }

    public void setImageDataProcessor(Class imageDataProcessor) {
        this.imageDataProcessor = imageDataProcessor;
    }

    public int getMaxSideLength() {
        return maxSideLength;
    }

    public void setMaxSideLength(int maxSideLength) {
        this.maxSideLength = maxSideLength;
    }
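    // For orientation: with the default feature set, each image ends up as one Solr
    // document roughly shaped like the following (values shortened; the field names
    // come from FeatureRegistry, e.g. "cl_hi" for the ColorLayout histogram and
    // "cl_ha" for its bit-sampling hashes):
    //
    //   <doc>
    //     <field name="id">/path/to/image.jpg</field>
    //     <field name="title">/path/to/image.jpg</field>
    //     <field name="cl_hi">PQodHBwc...</field>
    //     <field name="cl_ha">6a8 3be8 5f11 ...</field>
    //     ... one histogram and one hash field per feature ...
    //   </doc>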
    private boolean isConfigured() {
        boolean configured = true;
        if (fileList == null || !fileList.exists()) configured = false;
        else if (outFile == null) {
            // no outfile given: write one XML file per input image.
            individualFiles = true;
        } else if (outFile.exists() && !force) {
            System.err.println(outFile.getName() + " already exists. Please delete or choose another outfile.");
            configured = false;
        }
        return configured;
    }

    @Override
    public void run() {
        // check:
        if (fileList == null || !fileList.exists()) {
            System.err.println("No text file with a list of images given.");
            return;
        }
        System.out.println("Extracting features: ");
        for (Iterator<Class> iterator = listOfFeatures.iterator(); iterator.hasNext(); ) {
            System.out.println("\t" + iterator.next().getCanonicalName());
        }
        try {
            if (!individualFiles) {
                // create a BufferedOutputStream with a large buffer
                dos = new BufferedOutputStream(new FileOutputStream(outFile), 1024 * 1024 * 8);
                dos.write("<add>\n".getBytes());
            }
            Thread p = new Thread(new Producer(), "Producer");
            p.start();
            LinkedList<Thread> threads = new LinkedList<Thread>();
            long l = System.currentTimeMillis();
            for (int i = 0; i < numberOfThreads; i++) {
                Thread c = new Thread(new Consumer(), "Consumer-" + i);
                c.start();
                threads.add(c);
            }
            Thread m = new Thread(new Monitoring(), "Monitoring");
            m.start();
            for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext(); ) {
                iterator.next().join();
            }
            long l1 = System.currentTimeMillis() - l;
            System.out.println("Analyzed " + overallCount + " images in " + l1 / 1000 + " seconds, ~"
                    + (overallCount > 0 ? (l1 / overallCount) : "inf.") + " ms each.");
            if (!individualFiles) {
                dos.write("</add>\n".getBytes());
                dos.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates one instance of each configured feature class and adds it to the given list.
     */
    private void addFeatures(List features) {
        for (Iterator<Class> iterator = listOfFeatures.iterator(); iterator.hasNext(); ) {
            Class next = iterator.next();
            try {
                features.add(next.newInstance());
            } catch (InstantiationException e) {
                e.printStackTrace();
            } catch (IllegalAccessException e) {
                e.printStackTrace();
            }
        }
    }

    public boolean isPreprocessing() {
        return isPreprocessing;
    }

    public void setPreprocessing(boolean isPreprocessing) {
        this.isPreprocessing = isPreprocessing;
    }

    public boolean isForce() {
        return force;
    }

    public void setForce(boolean force) {
        this.force = force;
    }
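    // The inner classes below form a simple producer/consumer pipeline: one Producer
    // thread reads the image files into byte buffers and puts them into the bounded
    // LinkedBlockingQueue (at most maxCacheSize items, which keeps memory use in check),
    // numberOfThreads Consumer threads take buffers from the queue and extract the
    // features, and one Monitoring thread prints progress every monitoringInterval
    // seconds. The Producer signals the end of the input by queueing "poison pills",
    // i.e. WorkItems with a null buffer, two per consumer thread.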
    class Monitoring implements Runnable {
        public void run() {
            long ms = System.currentTimeMillis();
            try {
                Thread.sleep(1000 * monitoringInterval); // wait xx seconds
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            while (!ended) {
                try {
                    // print the current status:
                    long time = System.currentTimeMillis() - ms;
                    System.out.println("Analyzed " + overallCount + " images in " + time / 1000 + " seconds, "
                            + ((overallCount > 0) ? (time / overallCount) : "n.a.") + " ms each ("
                            + images.size() + " images currently in queue).");
                    Thread.sleep(1000 * monitoringInterval); // wait xx seconds
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    class Producer implements Runnable {
        public void run() {
            try {
                BufferedReader br = new BufferedReader(new FileReader(fileList));
                String file = null;
                File next = null;
                while ((file = br.readLine()) != null) {
                    next = new File(file);
                    try {
                        int fileSize = (int) next.length();
                        byte[] buffer = new byte[fileSize];
                        FileInputStream fis = new FileInputStream(next);
                        // a single read(byte[]) is not guaranteed to fill the buffer,
                        // so read the file completely with readFully(..).
                        new DataInputStream(fis).readFully(buffer);
                        fis.close();
                        String path = next.getCanonicalPath();
                        images.put(new WorkItem(path, buffer));
                    } catch (Exception e) {
                        System.err.println("Could not read image " + file + ": " + e.getMessage());
                    }
                }
                br.close();
                // queue the poison pills so each consumer can terminate.
                for (int i = 0; i < numberOfThreads * 2; i++) {
                    String tmpString = null;
                    BufferedImage tmpImg = null;
                    try {
                        images.put(new WorkItem(tmpString, tmpImg));
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            ended = true;
        }
    }
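    // Each Consumer creates its own set of LireFeature instances once (via
    // addFeatures(..) in the constructor) and reuses them for every image. LIRE's
    // feature implementations keep the extracted histogram as internal state, so
    // sharing instances across consumer threads would not be safe.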
    class Consumer implements Runnable {
        WorkItem tmp = null;
        LinkedList<LireFeature> features = new LinkedList<LireFeature>();
        int count = 0;
        boolean locallyEnded = false;
        StringBuilder sb = new StringBuilder(1024);

        Consumer() {
            addFeatures(features);
        }

        public void run() {
            while (!locallyEnded) {
                try {
                    // we wait for the queue to be either filled, or empty & not being filled any more.
                    // make sure the thread locally knows that the end has come (outer loop);
                    // the last thing we want is an exception in the very last round.
                    if (!locallyEnded) {
                        tmp = images.take();
                        if (tmp.getBuffer() == null)
                            locallyEnded = true;
                        else {
                            count++;
                            overallCount++; // not synchronized, so the progress numbers are approximate
                        }
                    }
                    if (!locallyEnded) {
                        sb.delete(0, sb.length());
                        ByteArrayInputStream b = new ByteArrayInputStream(tmp.getBuffer());
                        // reads the image. Make sure the TwelveMonkeys lib is in the path to read all JPEGs and TIFFs.
                        BufferedImage read = ImageIO.read(b);
                        // --------< preprocessing >-------------------------
                        // converts color space to INT_RGB
                        BufferedImage img = ImageUtils.createWorkingCopy(read);
                        if (isPreprocessing) {
                            // despeckle
                            DespeckleFilter df = new DespeckleFilter();
                            img = df.filter(img, null);
                            img = ImageUtils.trimWhiteSpace(img); // trims white space
                        }
                        // --------< / preprocessing >-------------------------
                        if (maxSideLength > 50)
                            img = ImageUtils.scaleImage(img, maxSideLength); // scales image down to maxSideLength (default 512).
                        else if (img.getWidth() < 32 || img.getHeight() < 32) {
                            // image is too small to be worked with, for now just upscale it to 128 pixels max. side length.
                            double scaleFactor = 128d;
                            if (img.getWidth() > img.getHeight()) {
                                scaleFactor = (128d / (double) img.getWidth());
                            } else {
                                scaleFactor = (128d / (double) img.getHeight());
                            }
                            img = ImageUtils.scaleImage(img, ((int) (scaleFactor * img.getWidth())), (int) (scaleFactor * img.getHeight()));
                        }
                        ImageDataProcessor idp = null;
                        try {
                            if (imageDataProcessor != null) {
                                idp = (ImageDataProcessor) imageDataProcessor.newInstance();
                            }
                        } catch (Exception e) {
                            System.err.println("Could not instantiate ImageDataProcessor!");
                            e.printStackTrace();
                        }
                        // --------< creating doc >-------------------------
                        sb.append("<doc>");
                        sb.append("<field name=\"id\">");
                        if (idp == null) sb.append(tmp.getFileName());
                        else sb.append(idp.getIdentifier(tmp.getFileName()));
                        sb.append("</field>");
                        sb.append("<field name=\"title\">");
                        if (idp == null) sb.append(tmp.getFileName());
                        else sb.append(idp.getTitle(tmp.getFileName()));
                        sb.append("</field>");
                        if (idp != null) sb.append(idp.getAdditionalFields(tmp.getFileName()));

                        for (LireFeature feature : features) {
                            String featureCode = FeatureRegistry.getCodeForClass(feature.getClass());
                            if (featureCode != null) {
                                feature.extract(img);
                                String histogramField = FeatureRegistry.codeToFeatureField(featureCode);
                                String hashesField = FeatureRegistry.codeToHashField(featureCode);
                                sb.append("<field name=\"" + histogramField + "\">");
                                sb.append(Base64.encodeBase64String(feature.getByteArrayRepresentation()));
                                sb.append("</field>");
                                sb.append("<field name=\"" + hashesField + "\">");
                                sb.append(arrayToString(BitSampling.generateHashes(feature.getDoubleHistogram())));
                                sb.append("</field>");
                            }
                        }
                        sb.append("</doc>\n");
                        // --------< / creating doc >-------------------------

                        // finally write everything to the stream - in case no exception was thrown.
                        if (!individualFiles) {
                            synchronized (dos) {
                                dos.write(sb.toString().getBytes());
                                // dos.flush(); // flushing takes too long ... better not.
                            }
                        } else {
                            OutputStream mos = new BufferedOutputStream(
                                    new FileOutputStream(tmp.getFileName() + "_solr.xml"));
                            mos.write(sb.toString().getBytes());
                            mos.flush();
                            mos.close();
                        }
                    }
                } catch (Exception e) {
                    System.err.println("Error processing file " + tmp.getFileName());
                    e.printStackTrace();
                }
            }
        }
    }
}
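For the -r option, a minimal sketch of a custom ImageDataProcessor might look like the
following. This is only an illustration: the three methods and their String signatures are
inferred from how idp is used in the Consumer above, and the package name my.example as
well as the Solr field name "filename_s" are made up for the example.

package my.example;

import java.io.File;
import net.semanticmetadata.lire.solr.indexing.ImageDataProcessor;

public class FileNameDataProcessor implements ImageDataProcessor {

    // use the plain file name, without the directory part, as document id
    public String getIdentifier(String filename) {
        return filename.substring(filename.lastIndexOf(File.separatorChar) + 1);
    }

    // reuse the identifier as title
    public String getTitle(String filename) {
        return getIdentifier(filename);
    }

    // return ready-made XML that the Consumer appends verbatim to the <doc> element;
    // "filename_s" is a hypothetical dynamic string field in the Solr schema.
    public String getAdditionalFields(String filename) {
        return "<field name=\"filename_s\">" + filename + "</field>";
    }
}

The class needs a public no-argument constructor, because ParallelSolrIndexer instantiates
it via newInstance(), and it has to be on the classpath when the indexer runs, e.g.:

$> ParallelSolrIndexer -i list.txt -o out.xml -r my.example.FileNameDataProcessor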