org.apache.nutch.tools.FileDumper.java Source code

Introduction

Here is the source code for org.apache.nutch.tools.FileDumper.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.tools;

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.ByteArrayInputStream;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import com.google.common.base.Strings;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.codec.digest.DigestUtils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.DumpFileUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.TableUtil;

import org.apache.tika.Tika;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.codehaus.jackson.map.ObjectMapper;

/**
 * The file dumper tool enables one to reverse generate the raw content from
 * Nutch segment data directories.
 * <p>
 * The tool has a number of immediate uses:
 * <ol>
 * <li>one can see what a page looked like at the time it was crawled</li>
 * <li>one can see different media types acquired as part of the crawl</li>
 * <li>it enables us to see webpages before we augment them with additional
 * metadata, this can be handy for providing a provenance trail for your crawl
 * data.</li>
 * </ol>
 * <p>
 * Upon successful completion the tool displays a very convenient JSON snippet
 * detailing the mimetype classifications and the counts of documents which fall
 * into those classifications. An example is as follows:
 * 
 * <pre>
 * {@code
 * INFO: File Types: 
 *   TOTAL Stats:    
 *    [
 *     {"mimeType":"application/xml","count":"19"}
 *     {"mimeType":"image/png","count":"47"}
 *     {"mimeType":"image/jpeg","count":"141"}
 *     {"mimeType":"image/vnd.microsoft.icon","count":"4"}
 *     {"mimeType":"text/plain","count":"89"}
 *     {"mimeType":"video/quicktime","count":"2"}
 *     {"mimeType":"image/gif","count":"63"}
 *     {"mimeType":"application/xhtml+xml","count":"1670"}
 *     {"mimeType":"application/octet-stream","count":"40"}
 *     {"mimeType":"text/html","count":"1863"}
 *   ]
 *   
 *   FILTER Stats: 
 *   [
 *     {"mimeType":"image/png","count":"47"}
 *     {"mimeType":"image/jpeg","count":"141"}
 *     {"mimeType":"image/vnd.microsoft.icon","count":"4"}
 *     {"mimeType":"video/quicktime","count":"2"}
 *     {"mimeType":"image/gif","count":"63"}
 *   ]
 * }
 * </pre>
 * <p>
 * In the case above, the tool would have been run with the <b>-mimeType
 * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b>
 * flag and corresponding values activated.
 * </p>
 */
public class FileDumper {

    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /**
     * Dumps the reverse engineered raw content from the provided segment
     * directories if a parent directory contains more than one segment, otherwise
     * a single segment can be passed as an argument.
     * 
     * @param outputDir
     *          the directory you wish to dump the raw content to. This directory
     *          will be created.
     * @param segmentRootDir
     *          a directory containing one or more segments.
     * @param mimeTypes
     *          an array of mime types we have to dump, all others will be
     *          filtered out.
     * @param flatDir
     *          a boolean flag specifying whether the output directory should contain
     *          only files instead of using nested directories to prevent naming
     *          conflicts.
     * @param mimeTypeStats
     *          a flag indicating whether mimetype stats should be displayed
     *          instead of dumping files.
     * @throws Exception
     */
    public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir,
            boolean mimeTypeStats, boolean reverseURLDump) throws Exception {
        if (mimeTypes == null)
            LOG.info("Accepting all mimetypes.");
        // total file counts
        Map<String, Integer> typeCounts = new HashMap<>();
        // filtered file counts
        Map<String, Integer> filteredCounts = new HashMap<>();
        Configuration conf = NutchConfiguration.create();
        int fileCount = 0;
        File[] segmentDirs = segmentRootDir.listFiles(file -> file.canRead() && file.isDirectory());
        if (segmentDirs == null) {
            LOG.error("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]");
            return;
        }

        for (File segment : segmentDirs) {
            LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
            DataOutputStream doutputStream = null;
            Map<String, String> filenameToUrl = new HashMap<String, String>();

            File segmentDir = new File(segment.getAbsolutePath(), Content.DIR_NAME);
            File[] partDirs = segmentDir.listFiles(file -> file.canRead() && file.isDirectory());

            if (partDirs == null) {
                LOG.warn("Skipping Corrupt Segment: [{}]", segment.getAbsolutePath());
                continue;
            }

            for (File partDir : partDirs) {
                try (FileSystem fs = FileSystem.get(conf)) {
                    String segmentPath = partDir + "/data";
                    Path file = new Path(segmentPath);
                    if (!new File(file.toString()).exists()) {
                        LOG.warn("Skipping segment: [" + segmentPath + "]: no data directory present");
                        continue;
                    }

                    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(file));

                    Writable key = (Writable) reader.getKeyClass().newInstance();
                    Content content = null;

                    while (reader.next(key)) {
                        content = new Content();
                        reader.getCurrentValue(content);
                        String url = key.toString();
                        String baseName = FilenameUtils.getBaseName(url);
                        String extension = FilenameUtils.getExtension(url);
                        if (extension == null || (extension != null && extension.equals(""))) {
                            extension = "html";
                        }

                        ByteArrayInputStream bas = null;
                        Boolean filter = false;
                        try {
                            bas = new ByteArrayInputStream(content.getContent());
                            String mimeType = new Tika().detect(content.getContent());
                            collectStats(typeCounts, mimeType);
                            if (mimeType != null) {
                                if (mimeTypes == null || Arrays.asList(mimeTypes).contains(mimeType)) {
                                    collectStats(filteredCounts, mimeType);
                                    filter = true;
                                }
                            }
                        } catch (Exception e) {
                            e.printStackTrace();
                            LOG.warn("Tika is unable to detect type for: [" + url + "]");
                        } finally {
                            if (bas != null) {
                                try {
                                    bas.close();
                                } catch (Exception ignore) {
                                }
                            }
                        }

                        if (filter) {
                            if (!mimeTypeStats) {
                                String md5Ofurl = DumpFileUtil.getUrlMD5(url);

                                String fullDir = outputDir.getAbsolutePath();
                                if (!flatDir && !reverseURLDump) {
                                    fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
                                }

                                if (!Strings.isNullOrEmpty(fullDir)) {
                                    String outputFullPath;

                                    if (reverseURLDump) {
                                        String[] reversedURL = TableUtil.reverseUrl(url).split(":");
                                        reversedURL[0] = reversedURL[0].replace('.', '/');

                                        String reversedURLPath = reversedURL[0] + "/"
                                                + DigestUtils.sha256Hex(url).toUpperCase();
                                        outputFullPath = String.format("%s/%s", fullDir, reversedURLPath);

                                        // We'll drop the trailing file name and create the nested structure if it doesn't already exist.
                                        String[] splitPath = outputFullPath.split("/");
                                        File fullOutputDir = new File(org.apache.commons.lang3.StringUtils
                                                .join(Arrays.copyOf(splitPath, splitPath.length - 1), "/"));

                                        if (!fullOutputDir.exists()) {
                                            fullOutputDir.mkdirs();
                                        }
                                    } else {
                                        outputFullPath = String.format("%s/%s", fullDir,
                                                DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
                                    }
                                    filenameToUrl.put(outputFullPath, url);
                                    File outputFile = new File(outputFullPath);

                                    if (!outputFile.exists()) {
                                        LOG.info("Writing: [" + outputFullPath + "]");

                                        // Modified to prevent FileNotFoundException (Invalid Argument)
                                        FileOutputStream output = null;
                                        try {
                                            output = new FileOutputStream(outputFile);
                                            IOUtils.write(content.getContent(), output);
                                        } catch (Exception e) {
                                            LOG.warn("Write Error: [" + outputFullPath + "]");
                                            e.printStackTrace();
                                        } finally {
                                            if (output != null) {
                                                output.flush();
                                                try {
                                                    output.close();
                                                } catch (Exception ignore) {
                                                }
                                            }
                                        }
                                        fileCount++;
                                    } else {
                                        LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                                    }
                                }
                            }
                        }
                    }
                    reader.close();
                } finally {
                    if (doutputStream != null) {
                        try {
                            doutputStream.close();
                        } catch (Exception ignore) {
                        }
                    }
                }
            }
            //save filenameToUrl in a json file for each segment there is one mapping file 
            String filenameToUrlFilePath = String.format("%s/%s_filenameToUrl.json", outputDir.getAbsolutePath(),
                    segment.getName());
            new ObjectMapper().writeValue(new File(filenameToUrlFilePath), filenameToUrl);

        }
        LOG.info("Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));

        if (mimeTypeStats) {
            System.out.println("Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
        }
    }

    /**
     * Main method for invoking this tool
     * 
     * @param args
     *          1) output directory (which will be created) to host the raw data
     *          and 2) a directory containing one or more segments.
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        // boolean options
        Option helpOpt = new Option("h", "help", false, "show this help message");
        // argument options
        @SuppressWarnings("static-access")
        Option outputOpt = OptionBuilder.withArgName("outputDir").hasArg()
                .withDescription("output directory (which will be created) to host the raw data")
                .create("outputDir");
        @SuppressWarnings("static-access")
        Option segOpt = OptionBuilder.withArgName("segment").hasArgs().withDescription("the segment(s) to use")
                .create("segment");
        @SuppressWarnings("static-access")
        Option mimeOpt = OptionBuilder.withArgName("mimetype").hasArgs()
                .withDescription("an optional list of mimetypes to dump, excluding all others. Defaults to all.")
                .create("mimetype");
        @SuppressWarnings("static-access")
        Option mimeStat = OptionBuilder.withArgName("mimeStats")
                .withDescription("only display mimetype stats for the segment(s) instead of dumping file.")
                .create("mimeStats");
        @SuppressWarnings("static-access")
        Option dirStructureOpt = OptionBuilder.withArgName("flatdir")
                .withDescription("optionally specify that the output directory should only contain files.")
                .create("flatdir");
        @SuppressWarnings("static-access")
        Option reverseURLOutput = OptionBuilder.withArgName("reverseUrlDirs")
                .withDescription("optionally specify to use reverse URL folders for output structure.")
                .create("reverseUrlDirs");

        // create the options
        Options options = new Options();
        options.addOption(helpOpt);
        options.addOption(outputOpt);
        options.addOption(segOpt);
        options.addOption(mimeOpt);
        options.addOption(mimeStat);
        options.addOption(dirStructureOpt);
        options.addOption(reverseURLOutput);

        CommandLineParser parser = new GnuParser();
        try {
            CommandLine line = parser.parse(options, args);
            if (line.hasOption("help") || !line.hasOption("outputDir") || (!line.hasOption("segment"))) {
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp("FileDumper", options, true);
                return;
            }

            File outputDir = new File(line.getOptionValue("outputDir"));
            File segmentRootDir = new File(line.getOptionValue("segment"));
            String[] mimeTypes = line.getOptionValues("mimetype");
            boolean flatDir = line.hasOption("flatdir");
            boolean shouldDisplayStats = false;
            if (line.hasOption("mimeStats"))
                shouldDisplayStats = true;
            boolean reverseURLDump = false;
            if (line.hasOption("reverseUrlDirs"))
                reverseURLDump = true;

            if (!outputDir.exists()) {
                LOG.warn("Output directory: [" + outputDir.getAbsolutePath() + "]: does not exist, creating it.");
                if (!shouldDisplayStats) {
                    if (!outputDir.mkdirs())
                        throw new Exception("Unable to create: [" + outputDir.getAbsolutePath() + "]");
                }
            }

            FileDumper dumper = new FileDumper();
            dumper.dump(outputDir, segmentRootDir, mimeTypes, flatDir, shouldDisplayStats, reverseURLDump);
        } catch (Exception e) {
            LOG.error("FileDumper: " + StringUtils.stringifyException(e));
            e.printStackTrace();
            return;
        }
    }

    private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
        typeCounts.put(mimeType, typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1 : 1);
    }

}