org.apache.nutch.util.DumpFileUtil.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.util.DumpFileUtil.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.util;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.MD5Hash;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Map;

/**
 * Static helpers that build the directory and file names used when dumping
 * fetched content to the local file system, plus a small formatter for
 * per-MIME-type statistics.
 */
public class DumpFileUtil {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    private final static String DIR_PATTERN = "%s/%s/%s";
    private final static String FILENAME_PATTERN = "%s_%s.%s";
    private final static int MAX_LENGTH_OF_FILENAME = 32;
    private final static int MAX_LENGTH_OF_EXTENSION = 5;

    /**
     * Computes the MD5 digest of a URL (via Hadoop's {@link MD5Hash}) and
     * renders it as a lowercase hexadecimal string, two characters per
     * digest byte.
     *
     * @param url the URL to hash
     * @return the hexadecimal representation of the digest
     */
    public static String getUrlMD5(String url) {
        byte[] digest = MD5Hash.digest(url).getDigest();

        // Local, single-threaded accumulation: no need for a synchronized StringBuffer.
        StringBuilder sb = new StringBuilder(digest.length * 2);
        for (byte b : digest) {
            sb.append(String.format("%02x", b & 0xff));
        }

        return sb.toString();
    }

    /**
     * Builds a two-level directory path below {@code basePath}. The first
     * level is named after the characters at indexes 0 and 8 of {@code md5},
     * the second level after the characters at indexes 16 and 24.
     *
     * @param basePath the root directory the two levels are appended to
     * @param md5      the hash string to derive the directory names from; it
     *                 must contain at least 25 characters
     * @param makeDir  whether the directory should be created on disk
     * @return the full directory path, or {@code null} if {@code makeDir} is
     *         {@code true} and the directory could not be created
     * @throws StringIndexOutOfBoundsException if {@code md5} is shorter than
     *                                         25 characters
     */
    public static String createTwoLevelsDirectory(String basePath, String md5, boolean makeDir) {
        String firstLevelDirName = new String(new char[] { md5.charAt(0), md5.charAt(8) });
        String secondLevelDirName = new String(new char[] { md5.charAt(16), md5.charAt(24) });

        String fullDirPath = String.format(DIR_PATTERN, basePath, firstLevelDirName, secondLevelDirName);

        if (makeDir && !makeDirectory(fullDirPath)) {
            return null;
        }

        return fullDirPath;
    }

    /**
     * Same as {@link #createTwoLevelsDirectory(String, String, boolean)} with
     * {@code makeDir} set to {@code true}.
     *
     * @param basePath the root directory the two levels are appended to
     * @param md5      the hash string to derive the directory names from
     * @return the full directory path, or {@code null} if the directory could
     *         not be created
     */
    public static String createTwoLevelsDirectory(String basePath, String md5) {
        return createTwoLevelsDirectory(basePath, md5, true);
    }

    /**
     * Builds a file name of the form {@code <md5>_<baseName>.<extension>}.
     * The base name is limited to 32 characters and the extension to 5
     * characters; longer values are truncated (and the truncation is logged).
     * Any {@code '?'} left after truncation is removed from both parts.
     *
     * @param md5           the hash used as file name prefix
     * @param fileBaseName  the base name of the file (without extension)
     * @param fileExtension the file extension (without leading dot)
     * @return the assembled file name
     */
    public static String createFileName(String md5, String fileBaseName, String fileExtension) {
        String baseName = truncate(fileBaseName, MAX_LENGTH_OF_FILENAME, "File name");
        String extension = truncate(fileExtension, MAX_LENGTH_OF_EXTENSION, "File extension");

        // Added to prevent FileNotFoundException (Invalid Argument) - in *nix environment
        baseName = baseName.replace("?", "");
        extension = extension.replace("?", "");

        return String.format(FILENAME_PATTERN, md5, baseName, extension);
    }

    /**
     * Builds the output path
     * {@code <basePath>/<reverseKey>/<sha1(urlString)>/<epochScrapeTime>.<extension>},
     * using the platform file separator. The extension is limited to 5
     * characters; a longer value is truncated (and the truncation is logged).
     *
     * <p>If {@code makeDir} is {@code true} and the directory cannot be
     * created, the failure is logged and the intended path is still returned,
     * so that a subsequent write fails against the real location instead of a
     * bogus relative {@code "null"} directory.
     *
     * @param basePath        the root output directory
     * @param reverseKey      the directory name placed directly below
     *                        {@code basePath}
     * @param urlString       the URL whose SHA-1 hex digest names the
     *                        innermost directory
     * @param epochScrapeTime the value used as file base name
     * @param fileExtension   the file extension (without leading dot)
     * @param makeDir         whether the directory should be created on disk
     * @return the full path of the output file
     */
    public static String createFileNameFromUrl(String basePath, String reverseKey, String urlString,
            String epochScrapeTime, String fileExtension, boolean makeDir) {
        String fullDirPath = basePath + File.separator + reverseKey + File.separator
                + DigestUtils.sha1Hex(urlString);

        if (makeDir) {
            // The helper logs any failure; the path itself stays valid either way.
            makeDirectory(fullDirPath);
        }

        String extension = truncate(fileExtension, MAX_LENGTH_OF_EXTENSION, "File extension");

        return fullDirPath + File.separator + epochScrapeTime + "." + extension;
    }

    /**
     * Renders the per-MIME-type counters as a human-readable report. The
     * report always contains a "TOTAL Stats" section for {@code typeCounts};
     * a "FILTERED Stats" section is appended only when {@code filteredCounts}
     * is non-empty. A {@code null} map is treated as empty.
     *
     * @param typeCounts     number of documents per MIME type
     * @param filteredCounts number of filtered documents per MIME type
     * @return the formatted report
     */
    public static String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) {
        StringBuilder builder = new StringBuilder();

        // print total stats
        builder.append("\nTOTAL Stats:\n");
        int totalCount = appendCounts(builder, typeCounts);
        builder.append("Total count: ").append(totalCount).append("\n");

        // filtered types stats
        if (filteredCounts != null && !filteredCounts.isEmpty()) {
            builder.append("\nFILTERED Stats:\n");
            int filteredCount = appendCounts(builder, filteredCounts);
            builder.append("Total filtered count: ").append(filteredCount).append("\n");
        }

        return builder.toString();
    }

    /**
     * Creates {@code dirPath} (including missing parents) and logs the cause
     * on failure.
     *
     * @return {@code true} if the directory exists after the call
     */
    private static boolean makeDirectory(String dirPath) {
        try {
            FileUtils.forceMkdir(new File(dirPath));
            return true;
        } catch (IOException e) {
            // Trailing throwable argument is logged as the exception by SLF4J.
            LOG.error("Failed to create dir: {}", dirPath, e);
            return false;
        }
    }

    /**
     * Cuts {@code value} down to {@code maxLength} characters, logging when a
     * truncation happens. {@code label} names the value in the log message.
     */
    private static String truncate(String value, int maxLength, String label) {
        if (value.length() <= maxLength) {
            return value;
        }
        LOG.info("{} is too long. Truncated to {} characters.", label, maxLength);
        return value.substring(0, maxLength);
    }

    /**
     * Appends one bracketed list of {@code {"mimeType":..,"count":..}} lines
     * for {@code counts} to {@code builder} and returns the sum of all counts.
     */
    private static int appendCounts(StringBuilder builder, Map<String, Integer> counts) {
        int sum = 0;
        builder.append("[\n");
        if (counts != null) {
            for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                builder.append("    {\"mimeType\":\"");
                builder.append(entry.getKey());
                builder.append("\",\"count\":\"");
                builder.append(entry.getValue());
                builder.append("\"}\n");
                sum += entry.getValue();
            }
        }
        builder.append("]\n");
        return sum;
    }
}