fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.java Source code

Introduction

Here is the source code for fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.java
Source

/*
 * Licensed to David Pilato (the "Author") under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Author licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package fr.pilato.elasticsearch.crawler.fs.framework;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.CopyOption;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.FileVisitOption;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.BasicFileAttributeView;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileOwnerAttributeView;
import java.nio.file.attribute.PosixFileAttributeView;
import java.nio.file.attribute.PosixFileAttributes;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TimeZone;

public class FsCrawlerUtil {
    public static final String INDEX_SUFFIX_FOLDER = "_folder";
    public static final String INDEX_SETTINGS_FILE = "_settings";
    public static final String INDEX_SETTINGS_FOLDER_FILE = "_settings_folder";

    private static final Logger logger = LogManager.getLogger(FsCrawlerUtil.class);

    /**
     * Reads a mapping from config/_default/version/type.json file
     *
     * @param config Root dir where we can find the configuration (default to ~/.fscrawler)
     * @param version Elasticsearch major version number (only major digit is kept so for 2.3.4 it will be 2)
     * @param type The expected type (will be expanded to type.json)
     * @return the mapping
     * @throws URISyntaxException If URI is malformed
     * @throws IOException If the mapping can not be read
     */
    public static String readDefaultJsonVersionedFile(Path config, String version, String type)
            throws URISyntaxException, IOException {
        Path defaultConfigDir = config.resolve("_default");
        try {
            return readJsonVersionedFile(defaultConfigDir, version, type);
        } catch (NoSuchFileException e) {
            throw new IllegalArgumentException(
                    "Mapping file " + type + ".json does not exist for elasticsearch version " + version + " in ["
                            + defaultConfigDir + "] dir");
        }
    }

    /**
     * Reads a mapping from dir/version/type.json file
     *
     * @param dir Directory containing mapping files per major version
     * @param version Elasticsearch major version number (only major digit is kept so for 2.3.4 it will be 2)
     * @param type The expected type (will be expanded to type.json)
     * @return the mapping
     * @throws IOException If the mapping can not be read
     */
    private static String readJsonVersionedFile(Path dir, String version, String type) throws IOException {
        Path file = dir.resolve(version).resolve(type + ".json");
        return new String(Files.readAllBytes(file), "UTF-8");
    }

    /**
     * Reads a Json file from dir/version/filename.json file.
     * If not found, read from ~/.fscrawler/_default/version/filename.json
     *
     * @param dir Directory which might contain filename files per major version (job dir)
     * @param config Root dir where we can find the configuration (default to ~/.fscrawler)
     * @param version Elasticsearch major version number (only major digit is kept so for 2.3.4 it will be 2)
     * @param filename The expected filename (will be expanded to filename.json)
     * @return the mapping
     * @throws URISyntaxException If URI is malformed
     * @throws IOException If the mapping can not be read
     */
    public static String readJsonFile(Path dir, Path config, String version, String filename)
            throws URISyntaxException, IOException {
        try {
            return readJsonVersionedFile(dir, version, filename);
        } catch (NoSuchFileException e) {
            // We fall back to default mappings in config dir
            return readDefaultJsonVersionedFile(config, version, filename);
        }
    }

    /**
     * We check if we can index the file or if we should ignore it
     *
     * @param filename The filename to scan
     * @param includes include rules, may be empty not null
     * @param excludes exclude rules, may be empty not null
     */
    public static boolean isIndexable(String filename, List<String> includes, List<String> excludes) {
        logger.debug("filename = [{}], includes = [{}], excludes = [{}]", filename, includes, excludes);

        boolean excluded = isExcluded(filename, excludes);
        if (excluded)
            return false;

        return isIncluded(filename, includes);
    }

    /**
     * We check if we can index the file or if we should ignore it
     *
     * @param filename The filename to scan
     * @param excludes exclude rules, may be empty not null
     */
    public static boolean isExcluded(String filename, List<String> excludes) {
        logger.debug("filename = [{}], excludes = [{}]", filename, excludes);

        // No rules ? Fine, we index everything
        if (excludes == null || excludes.isEmpty()) {
            logger.trace("no rules");
            return false;
        }

        // Exclude rules : we know that whatever includes rules are, we should exclude matching files
        for (String exclude : excludes) {
            String regex = exclude.toLowerCase().replace("?", ".?").replace("*", ".*?");
            logger.trace("regex is [{}]", regex);
            if (filename.toLowerCase().matches(regex)) {
                logger.trace("does match exclude regex");
                return true;
            }
        }

        logger.trace("does not match any exclude pattern");
        return false;
    }

    /**
     * We check if we can index the file or if we should ignore it
     *
     * @param filename The filename to scan
     * @param includes include rules, may be empty not null
     */
    public static boolean isIncluded(String filename, List<String> includes) {
        logger.debug("filename = [{}], includes = [{}]", filename, includes);

        // No rules ? Fine, we index everything
        if (includes == null || includes.isEmpty()) {
            logger.trace("no include rules");
            return true;
        }

        for (String include : includes) {
            String regex = include.toLowerCase().replace("?", ".?").replace("*", ".*?");
            logger.trace("regex is [{}]", regex);
            if (filename.toLowerCase().matches(regex)) {
                logger.trace("does match include regex");
                return true;
            }
        }

        logger.trace("does not match any include pattern");
        return false;
    }

    public static String computeVirtualPathName(String rootPath, String realPath) {
        String result = "/";
        if (realPath != null && realPath.length() > rootPath.length()) {
            result = realPath.substring(rootPath.length()).replace("\\", "/");
        }

        logger.debug("computeVirtualPathName({}, {}) = {}", rootPath, realPath, result);
        return result;
    }

    public static LocalDateTime getCreationTime(File file) {
        LocalDateTime time;
        try {
            Path path = Paths.get(file.getAbsolutePath());
            BasicFileAttributes fileattr = Files.getFileAttributeView(path, BasicFileAttributeView.class)
                    .readAttributes();
            time = LocalDateTime.ofInstant(fileattr.creationTime().toInstant(), ZoneId.systemDefault());
        } catch (Exception e) {
            time = null;
        }
        return time;
    }

    public static Date localDateTimeToDate(LocalDateTime ldt) {
        return Date.from(ldt.atZone(TimeZone.getDefault().toZoneId()).toInstant());
    }

    public static Date localDateTimeToDate(String sDate) {
        if (sDate == null) {
            return null;
        }
        return localDateTimeToDate(LocalDateTime.parse(sDate, DateTimeFormatter.ISO_DATE_TIME));
    }

    public static String getFileExtension(File file) {
        return FilenameUtils.getExtension(file.getAbsolutePath()).toLowerCase();
    }

    /**
     * Determines the 'owner' of the file.
     */
    public static String getOwnerName(final File file) {
        try {
            final Path path = Paths.get(file.getAbsolutePath());
            final FileOwnerAttributeView ownerAttributeView = Files.getFileAttributeView(path,
                    FileOwnerAttributeView.class);
            return ownerAttributeView != null ? ownerAttributeView.getOwner().getName() : null;
        } catch (Exception e) {
            logger.warn("Failed to determine 'owner' of {}: {}", file, e.getMessage());
            return null;
        }
    }

    /**
     * Determines the 'group' of the file. Please note that 'group' is not
     * available of Windows OS.
     */
    public static String getGroupName(final File file) {
        if (OsValidator.windows) {
            logger.trace("Determining 'group' is skipped for file [{}]on [{}]", file, OsValidator.OS);
            return null;
        }
        try {
            final Path path = Paths.get(file.getAbsolutePath());
            final PosixFileAttributes posixFileAttributes = Files
                    .getFileAttributeView(path, PosixFileAttributeView.class).readAttributes();
            return posixFileAttributes != null ? posixFileAttributes.group().getName() : null;
        } catch (Exception e) {
            logger.warn("Failed to determine 'group' of {}: {}", file, e.getMessage());
            return null;
        }
    }

    private static final String CLASSPATH_RESOURCES_ROOT = "/fr/pilato/elasticsearch/crawler/fs/_default/";
    public static final String[] MAPPING_RESOURCES = { "2/_settings.json", "2/_settings_folder.json",
            "5/_settings.json", "5/_settings_folder.json", "6/_settings.json", "6/_settings_folder.json" };

    /**
     * Copy default resources files which are available as project resources under
     * fr.pilato.elasticsearch.crawler.fs._default package to a given configuration path
     * under a _default sub directory.
     * @param configPath The config path which is by default .fscrawler
     * @throws IOException If copying does not work
     */
    public static void copyDefaultResources(Path configPath) throws IOException, URISyntaxException {
        Path targetResourceDir = configPath.resolve("_default");

        for (String filename : MAPPING_RESOURCES) {
            Path target = targetResourceDir.resolve(filename);
            if (Files.exists(target)) {
                logger.debug("Mapping [{}] already exists", filename);
            } else {
                logger.debug("Copying [{}]...", filename);
                copyResourceFile(CLASSPATH_RESOURCES_ROOT + filename, target);
            }
        }
    }

    /**
     * Move a Legacy resource to another destination if needed
     * @param legacy Legacy file
     * @param destinationFile New file name (directory must exist)
     * @throws IOException In case moving file did not work
     */
    public static void moveLegacyResource(Path legacy, Path destinationFile) throws IOException {
        if (Files.exists(legacy)) {
            logger.debug("Found a legacy file at [{}]", legacy);
            Files.move(legacy, destinationFile);
            logger.info("Moved Legacy file from [{}] to [{}]", legacy, destinationFile);
        }
    }

    /**
     * Copy a single resource file from the classpath or from a JAR.
     * @param target The target
     * @throws IOException If copying does not work
     */
    public static void copyResourceFile(String source, Path target) throws IOException {
        InputStream resource = FsCrawlerUtil.class.getResourceAsStream(source);
        FileUtils.copyInputStreamToFile(resource, target.toFile());
    }

    /**
     * Copy files from a source to a target
     * under a _default sub directory.
     * @param source The source dir
     * @param target The target dir
     * @param options Potential options
     * @throws IOException If copying does not work
     */
    public static void copyDirs(Path source, Path target, CopyOption... options) throws IOException {
        if (Files.notExists(target)) {
            Files.createDirectory(target);
        }

        logger.debug("  --> Copying resources from [{}]", source);
        if (Files.notExists(source)) {
            throw new RuntimeException(source + " doesn't seem to exist.");
        }

        Files.walkFileTree(source, EnumSet.of(FileVisitOption.FOLLOW_LINKS), Integer.MAX_VALUE,
                new InternalFileVisitor(source, target, options));

        logger.debug("  --> Resources ready in [{}]", target);
    }

    private static class InternalFileVisitor extends SimpleFileVisitor<Path> {

        private final Path fromPath;
        private final Path toPath;
        private final CopyOption[] copyOption;

        public InternalFileVisitor(Path fromPath, Path toPath, CopyOption... copyOption) {
            this.fromPath = fromPath;
            this.toPath = toPath;
            this.copyOption = copyOption;
        }

        @Override
        public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {

            Path targetPath = toPath.resolve(fromPath.relativize(dir));
            if (!Files.exists(targetPath)) {
                Files.createDirectory(targetPath);
            }
            return FileVisitResult.CONTINUE;
        }

        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
            try {
                Files.copy(file, toPath.resolve(fromPath.relativize(file)), copyOption);
            } catch (FileAlreadyExistsException ignored) {
                // The file already exists we just ignore it
            }
            return FileVisitResult.CONTINUE;
        }
    }

    /**
     * Unzip a jar file
     * @param jarFile Jar file url like file:/path/to/foo.jar
     * @param destination Directory where we want to extract the content to
     * @throws IOException In case of any IO problem
     */
    public static void unzip(String jarFile, Path destination) throws IOException {
        Map<String, String> zipProperties = new HashMap<>();
        /* We want to read an existing ZIP File, so we set this to false */
        zipProperties.put("create", "false");
        zipProperties.put("encoding", "UTF-8");
        URI zipFile = URI.create("jar:" + jarFile);

        try (FileSystem zipfs = FileSystems.newFileSystem(zipFile, zipProperties)) {
            Path rootPath = zipfs.getPath("/");
            Files.walkFileTree(rootPath, new SimpleFileVisitor<Path>() {
                @Override
                public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
                    Path targetPath = destination.resolve(rootPath.relativize(dir).toString());
                    if (!Files.exists(targetPath)) {
                        Files.createDirectory(targetPath);
                    }
                    return FileVisitResult.CONTINUE;
                }

                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    Files.copy(file, destination.resolve(rootPath.relativize(file).toString()),
                            StandardCopyOption.COPY_ATTRIBUTES, StandardCopyOption.REPLACE_EXISTING);
                    return FileVisitResult.CONTINUE;
                }
            });
        }
    }

    public static String buildUrl(String scheme, String host, int port) {
        return scheme + "://" + host + ":" + port;
    }

    public static boolean isNullOrEmpty(String string) {
        return string == null || string.isEmpty();
    }
}