org.hipi.tools.downloader.Downloader.java Source code

Introduction

Here is the source code for org.hipi.tools.downloader.Downloader.java
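
Before reading the source, it may help to see how the tool is invoked. Based on the usage string and the Javadoc below, a typical run looks like the following (the jar name and paths here are illustrative):

    hadoop jar hibDownload.jar /path/to/urlFiles /path/to/output.hib --num-nodes 10

The first argument is a directory of files that list one image URL per line, the second is the destination HIB, and --num-nodes controls how many parallel download tasks are created.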

Source

package org.hipi.tools.downloader;

import org.hipi.imagebundle.HipiImageBundle;
import org.hipi.image.HipiImageHeader;
import org.hipi.image.io.JpegCodec;
import org.hipi.image.io.PngCodec;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;

/**
 * A MapReduce program that takes a list of image URLs, downloads
 * them, and creates a {@link org.hipi.imagebundle.HipiImageBundle} (HIB)
 * from them. Also supports downloading the Yahoo/Flickr CC 100M
 * (YFCC100M) dataset.
 * 
 * When running this program, the user must specify two arguments: the
 * directory containing the lists of URLs (one URL per line) and the
 * output path for the HIB that will be generated. The number of
 * download nodes may be set with the --num-nodes option and should be
 * chosen with respect to the total bandwidth your particular cluster
 * can handle. An example usage would be:
 * <p>
 * hibDownload.jar /path/to/urlFiles /path/to/output.hib --num-nodes 10
 * <p>
 * This example forces 10 nodes to download the set of URLs contained
 * in the input lists, so if the lists contain 100,000 images, each
 * node is responsible for downloading roughly 10,000 images.
 *
 */
public class Downloader extends Configured implements Tool {

    private static final Options options = new Options();
    private static final CommandLineParser parser = new BasicParser();
    static {
        options.addOption("f", "force", false, "force overwrite if output HIB already exists");
        options.addOption("y", "yfcc100m", false, "assume input files are in Yahoo/Flickr CC 100M format");
        options.addOption("n", "num-nodes", true,
                "number of download nodes (default=1) (ignored if --yfcc100m is specified)");
    }

    private static void usage() {
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(148);
        formatter.printHelp(
                "hibDownload.jar <directory containing source files> <output HIB> [-f] [--yfcc100m] [--num-nodes #count]",
                options);
        System.exit(0);
    }

    private static long uniqueMapperKey = 0; // Ensures temp hib paths in mapper are unique
    private static long numDownloads = 0; // Keeps track of number of image downloads

    private final String FLICKR_PREFIX = "yfcc100m_dataset"; // This string represents the root name for each of the dataset files

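    /**
     * Mapper that downloads each image URL in its input split into a
     * temporary HIB and emits the path of that temporary HIB to the
     * reduce stage. The single reduce task (DownloaderReducer, defined
     * in a separate source file) then merges these temporary bundles
     * into the final output HIB.
     */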
    public static class DownloaderMapper extends Mapper<LongWritable, Text, BooleanWritable, Text> {

        private Configuration conf;

        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            this.conf = context.getConfiguration();
        }

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            // Use line number and a unique key assigned to each map task to generate a unique filename.
            String tempPath = conf.get("downloader.outpath") + key.get() + uniqueMapperKey + ".hib.tmp";

            boolean yfcc100m = conf.getBoolean("downloader.yfcc100m", false);

            // Create new temporary HIB
            HipiImageBundle hib = new HipiImageBundle(new Path(tempPath), conf);
            hib.openForWrite(true);

            // The value argument contains a list of image URLs delimited by
            // '\n'. Setup buffered reader to allow processing this string
            // line by line.
            BufferedReader lineReader = new BufferedReader(new StringReader(value.toString()));
            String line;

            // Iterate through URLs
            while ((line = lineReader.readLine()) != null) {

                String[] lineFields = null;
                String imageUri = null;

                if (yfcc100m) {
                    // Split line into fields
                    lineFields = line.split("\t"); // Fields within each line are delimited by tabs  
                    if (lineFields[22].equals("1")) { // 0 = image, 1 = video in YFCC100M format
                        continue;
                    }
                    imageUri = lineFields[14];
                } else {
                    imageUri = line; // Otherwise, assume entire line is image URL
                }

                long startTime = System.currentTimeMillis();
                String type = "";
                URLConnection conn;

                // Attempt to download image at URL using java.net
                try {
                    URL link = new URL(imageUri);
                    numDownloads++;
                    System.out.println();
                    System.out.println("Downloading: " + link.toString());
                    System.out.println("Number of downloads: " + numDownloads);
                    conn = link.openConnection();
                    conn.connect();
                    type = conn.getContentType();

                    // Check that image format is supported, header is parsable, and add to HIB if so
                    if (type != null
                            && (type.compareTo("image/jpeg") == 0 || type.compareTo("image/png") == 0)) {

                        // Get input stream for URL connection
                        InputStream bis = new BufferedInputStream(conn.getInputStream());

                        // Mark current location in stream for later reset
                        bis.mark(Integer.MAX_VALUE);

                        // Attempt to decode the image header
                        HipiImageHeader header = (type.compareTo("image/jpeg") == 0
                                ? JpegCodec.getInstance().decodeHeader(bis)
                                : PngCodec.getInstance().decodeHeader(bis));

                        if (header == null) {
                            System.out.println(
                                    "Failed to parse header, image not added to HIB: " + link.toString());
                        } else {

                            // Passed header decode test, so reset to beginning of stream
                            bis.reset();

                            if (yfcc100m) {
                                // Capture fields as image metadata for posterity
                                for (int i = 0; i < lineFields.length; i++) {
                                    header.addMetaData(String.format("col_%03d", i), lineFields[i]);
                                }
                                header.addMetaData("source", lineFields[14]);
                            } else {
                                // Capture source URL as image metadata for posterity
                                header.addMetaData("source", imageUri);
                            }

                            // Add image to HIB
                            hib.addImage(header, bis);

                            System.out.println("Added to HIB: " + imageUri);
                        }
                    } else {
                        System.out.println("Unrecognized HTTP content type or unsupported image format [" + type
                                + "], not added to HIB: " + imageUri);
                    }
                } catch (Exception e) {
                    System.out.println("Error while trying to download: " + imageUri);
                    e.printStackTrace();
                    // Back off briefly before moving on to the next URL
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException ie) {
                        ie.printStackTrace();
                    }
                }

                float el = (float) (System.currentTimeMillis() - startTime) / 1000.0f;
                System.out.println("> Time elapsed: " + el + " seconds");

            } // end while loop over lines in URL list

            try {
                // Output key/value pair to reduce layer consisting of boolean and path to HIB
                context.write(new BooleanWritable(true), new Text(hib.getPath().toString()));
                // Cleanup
                lineReader.close();
                hib.close();
            } catch (Exception e) {
                e.printStackTrace();
            }

            uniqueMapperKey++;

        }

        // Display metadata of the image
        public static void printFlickrImageMetadata(String[] lineArray) {
            System.out.println("  Flickr Image Metadata: ");
            System.out.println("    > Photo/Video Identifier: " + lineArray[0]);
            System.out.println("    > User NSID: " + lineArray[1]);
            System.out.println("    > User Nickname: " + lineArray[2]);
            System.out.println("    > Date Taken: " + lineArray[3]);
            System.out.println("    > Date Uploaded: " + lineArray[4]);
            System.out.println("    > Capture Device: " + lineArray[5]);
            System.out.println("    > Title: " + lineArray[6]);
            System.out.println("    > Description: " + lineArray[7]);
            System.out.println("    > User Tags: " + lineArray[8]);
            System.out.println("    > Machine Tags: " + lineArray[9]);
            System.out.println("    > Longitude: " + lineArray[10]);
            System.out.println("    > Latitude: " + lineArray[11]);
            System.out.println("    > Accuracy: " + lineArray[12]);
            System.out.println("    > Photo/Video Page URL: " + lineArray[13]);
            System.out.println("    > Photo/Video Download URL: " + lineArray[14]);
            System.out.println("    > License Name: " + lineArray[15]);
            System.out.println("    > License URL: " + lineArray[16]);
            System.out.println("    > Photo/Video Server Identifier: " + lineArray[17]);
            System.out.println("    > Photo/Video Farm Identifier: " + lineArray[18]);
            System.out.println("    > Photo/Video Secret: " + lineArray[19]);
            System.out.println("    > Photo/Video Secret Original: " + lineArray[20]);
            System.out.println("    > Extension of the Original Photo: " + lineArray[21]);
            System.out.println("    > Photos/video marker (0 = photo, 1 = video): " + lineArray[22]);
        }
    }

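    /**
     * Driver: parses the command line, validates each file in the source
     * directory by reading its first line, stages the job parameters in
     * the Configuration, and launches the MapReduce job with a single
     * reduce task.
     */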
    public int run(String[] args) throws Exception {

        // try to parse command line arguments
        CommandLine line = null;
        try {
            line = parser.parse(options, args);
        } catch (ParseException exp) {
            usage();
        }
        if (line == null) {
            usage();
        }

        String[] leftArgs = line.getArgs();

        if (leftArgs.length != 2) {
            usage();
        }

        String inputDir = leftArgs[0];
        String outputHib = leftArgs[1];

        boolean yfcc100m = line.hasOption("yfcc100m");
        int numDownloadNodes = (yfcc100m ? 1
                : ((line.hasOption("num-nodes") ? Integer.parseInt(line.getOptionValue("num-nodes")) : 1)));
        if (numDownloadNodes < 1) {
            System.err.println("Invalid number of download nodes specified [" + numDownloadNodes + "]");
            System.exit(1);
        }

        boolean overwrite = line.hasOption("force");

        System.out.println("Source directory: " + inputDir);
        System.out.println("Output HIB: " + outputHib);
        System.out.println("Overwrite output HIB if it exists: " + (overwrite ? "true" : "false"));
        System.out.println("YFCC100M format: " + (yfcc100m ? "true" : "false"));
        System.out.println("Number of download nodes: " + numDownloadNodes);

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Remove existing HIB if overwrite is specified and HIB exists
        if (!overwrite) {
            if (fs.exists(new Path(outputHib))) {
                System.err.println(
                        "HIB [" + outputHib + "] already exists. Use the \"--force\" argument to overwrite.");
                System.exit(1);
            }
        } else { // overwrite
            if (fs.exists(new Path(outputHib))) {
                System.out.println("Found that output HIB already exists, deleting.");
            }
        }

        fs.delete(new Path(outputHib), true);
        fs.delete(new Path(outputHib + ".dat"), true);
        fs.delete(new Path(outputHib + "_output"), true);

        // Scan source directory for list of input files
        FileStatus[] inputFiles = fs.listStatus(new Path(inputDir));
        if (inputFiles == null || inputFiles.length == 0) {
            System.err.println("Failed to find any files in source directory: " + inputDir);
            System.exit(1);
        }

        // Validate list of input files
        ArrayList<Path> sourceFiles = new ArrayList<Path>();
        for (FileStatus file : inputFiles) {

            Path path = file.getPath();

            if (yfcc100m) {
                String[] tokens = path.getName().split("-");
                if (tokens == null || tokens.length == 0) {
                    System.out.println("  Skipping source file (does not follow YFCC100M file name convention): "
                            + file.getPath());
                    continue;
                }
            }

            try {
                // If it exists, get the relevant compression codec
                CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
                CompressionCodec codec = codecFactory.getCodec(path);

                FSDataInputStream fis = fs.open(path);

                // If the codec was found, use it to create a decompressed input stream.
                // Otherwise, assume input stream is already decompressed
                BufferedReader reader = null;
                if (codec != null) {
                    reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fis)));
                } else {
                    reader = new BufferedReader(new InputStreamReader(fis));
                }

                String fileLine = reader.readLine();
                String[] lineFields = (yfcc100m ? fileLine.split("\t") : fileLine.split("\\s+"));

                if (yfcc100m) {
                    if (lineFields.length != 23) {
                        System.out.println("  Skipping source file (does not follow YFCC100M source file format): "
                                + file.getPath());
                    } else {
                        System.out.println("  Adding source file: " + file.getPath());
                        sourceFiles.add(path);
                    }
                } else {
                    if (lineFields.length != 1) {
                        System.out.println(
                                "  Skipping source file (contains multiple fields per line where only one is expected): "
                                        + file.getPath());
                        if (lineFields.length == 23) {
                            System.out.println("  Did you mean to use \"--yfcc100m\"?");
                        }
                    } else {
                        System.out.println("  Adding source file: " + file.getPath());
                        sourceFiles.add(path);
                    }
                }
                reader.close(); // Also closes the wrapped FSDataInputStream
            } catch (Exception e) {
                System.err.println("Skipping source file (unable to open and parse first line: " + file.getPath());
                continue;
            }

        }

        if (sourceFiles.size() == 0) {
            System.err.println("Failed to find any valid files in source directory: " + inputDir);
            System.exit(1);
        }

        // Construct path to directory containing outputHib
        String outputPath = outputHib.substring(0, outputHib.lastIndexOf('/') + 1);

        // Attaching job parameters to global Configuration object
        conf.setInt("downloader.nodes", numDownloadNodes);
        conf.setStrings("downloader.outfile", outputHib);
        conf.setStrings("downloader.outpath", outputPath);
        conf.setBoolean("downloader.yfcc100m", yfcc100m);

        Job job = Job.getInstance(conf, "hibDownload");
        job.setJarByClass(Downloader.class);
        job.setMapperClass(DownloaderMapper.class);
        job.setReducerClass(DownloaderReducer.class);
        job.setInputFormatClass(DownloaderInputFormat.class);
        job.setOutputKeyClass(BooleanWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(1);

        FileOutputFormat.setOutputPath(job, new Path(outputHib + "_output"));

        Path[] inputPaths = new Path[sourceFiles.size()];
        inputPaths = sourceFiles.toArray(inputPaths);
        DownloaderInputFormat.setInputPaths(job, inputPaths);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Downloader(), args);
        System.exit(res);
    }
}
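
A HIB is stored as a pair of files: an index file at the output path and a companion data file with a ".dat" suffix, which is why the driver deletes both outputHib and outputHib + ".dat" before launching the job. The directory at outputHib + "_output" receives the MapReduce framework's own output and is separate from the HIB itself.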