edu.umn.cs.spatialHadoop.util.TemporalIndexManager.java Source code

Java tutorial

Introduction

Here is the source code for edu.umn.cs.spatialHadoop.util.TemporalIndexManager.java

Source

/***********************************************************************
* Copyright (c) 2015 by Regents of the University of Minnesota.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Apache License, Version 2.0 which 
* accompanies this distribution and is available at
* http://www.opensource.org/licenses/apache2.0.php.
*
*************************************************************************/
package edu.umn.cs.spatialHadoop.util;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.util.GenericOptionsParser;

import edu.umn.cs.spatialHadoop.OperationsParams;

/**
 * Temporal index manager that can determine which files need to be
 * indexed/reindexed on the daily, monthly, and yearly levels.
 *
 * <p>Indexes are kept in three sub-directories of the index path:
 * {@code daily}, {@code monthly} and {@code yearly}. The manager records the
 * index directories that already exist when it is constructed, and
 * {@link #prepareNeededIndexes(String)} flags the ones that have to be built
 * or rebuilt for a given time range.</p>
 *
 * <p>Instances hold {@link SimpleDateFormat} objects and mutable maps without
 * any synchronization, so an instance should not be shared between
 * threads.</p>
 *
 * @author ibrahimsabek
 *
 */
public class TemporalIndexManager {

    /** Logger */
    private static final Log LOG = LogFactory.getLog(TemporalIndexManager.class);

    /** Marker used when the year of a date string cannot be read as a number. */
    private static final int UNKNOWN_YEAR = -1;

    private final SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy.MM.dd");
    private final SimpleDateFormat monthFormat = new SimpleDateFormat("yyyy.MM");
    private final SimpleDateFormat yearFormat = new SimpleDateFormat("yyyy");

    private final Path datasetPath;
    private final Path indexesPath;
    /** Stays {@code null} if the file system could not be obtained. */
    private FileSystem fileSystem;

    private final Path dailyIndexesHomePath;
    private final Path monthlyIndexesHomePath;
    private final Path yearlyIndexesHomePath;

    /*
     * Index name -> "needs to be (re)built" flag. Entries loaded from disk
     * start as false; prepareNeededIndexes() switches them to true.
     */
    private final Map<String, Boolean> existDailyIndexes = new HashMap<String, Boolean>();
    private final Map<String, Boolean> existMonthlyIndexes = new HashMap<String, Boolean>();
    private final Map<String, Boolean> existYearlyIndexes = new HashMap<String, Boolean>();

    // Start out empty (never null) so the getters are safe to iterate even if
    // prepareNeededIndexes() was not called or returned early.
    private Path[] neededDailyIndexes = new Path[0];
    private Path[] neededMonthlyIndexes = new Path[0];
    private Path[] neededYearlyIndexes = new Path[0];

    /**
     * Creates a manager, makes sure the {@code daily}, {@code monthly} and
     * {@code yearly} index directories exist, and records the index
     * directories that are already present.
     *
     * <p>An {@link IOException} raised while doing so is logged and not
     * rethrown. The object then holds whatever state was loaded before the
     * failure; if the file system itself could not be obtained,
     * {@link #prepareNeededIndexes(String)} will later fail with an
     * {@link IOException}.</p>
     *
     * @param datasetPath directory whose entries are named by date
     * @param indexesPath directory under which the indexes are stored
     * @throws ParseException declared for the helper that reads the existing
     *                        index names
     */
    public TemporalIndexManager(Path datasetPath, Path indexesPath) throws ParseException {
        this.datasetPath = datasetPath;
        this.indexesPath = indexesPath;

        // Path(parent, child) instead of string concatenation: concatenating
        // onto a root path yields "//daily", which Path parses as an
        // authority rather than a directory.
        this.dailyIndexesHomePath = new Path(indexesPath, "daily");
        this.monthlyIndexesHomePath = new Path(indexesPath, "monthly");
        this.yearlyIndexesHomePath = new Path(indexesPath, "yearly");

        try {
            this.fileSystem = indexesPath.getFileSystem(new Configuration());
            initializeIndexesHierarchy();
            loadExistIndexesDictionary();
        } catch (IOException e) {
            // Pass the exception to the logger so the stack trace ends up in
            // the log instead of on stderr.
            LOG.error("Failed to initialize TemporalIndexManager: " + e.getMessage(), e);
        }
    }

    /**
     * Creates folder hierarchy for indexes if not exist
     * 
     * @throws IOException if a directory is missing and cannot be created
     */
    private void initializeIndexesHierarchy() throws IOException {
        Path[] homePaths = { dailyIndexesHomePath, monthlyIndexesHomePath, yearlyIndexesHomePath };
        for (Path homePath : homePaths) {
            if (!this.fileSystem.exists(homePath) && !this.fileSystem.mkdirs(homePath)) {
                throw new IOException("Could not create index directory " + homePath);
            }
        }
    }

    /**
     * Based on a certain time range, this method filters all directories and
     * determines which files need to be indexed on daily, monthly and yearly
     * levels. After calling this method, you need to call the daily, monthly
     * and yearly getters to return paths required to be indexed.
     *
     * <p>If the range is {@code null}, lacks the {@code ..} separator or
     * contains an unparsable date, an error is logged and the method returns
     * without changing anything. Flags set by earlier calls are kept, so the
     * result of repeated calls is cumulative.</p>
     *
     * @param timeRange range in the form {@code yyyy.MM.dd..yyyy.MM.dd};
     *                  both ends are inclusive
     * @throws IOException if the dataset directory cannot be listed or the
     *                     file system was not initialized by the constructor
     * @throws ParseException declared for the date-string helpers
     */
    public void prepareNeededIndexes(String timeRange) throws IOException, ParseException {
        if (timeRange == null) {
            LOG.error("TimeRange is empty");
            return;
        }

        // Split once and validate the shape explicitly instead of relying on
        // an ArrayIndexOutOfBoundsException.
        String[] rangeParts = timeRange.split("\\.\\.");
        if (rangeParts.length < 2) {
            LOG.error("Use the seperator two periods '..' to seperate from and to dates");
            return;
        }

        // Parse start and end dates
        final Date startDate;
        final Date endDate;
        try {
            startDate = dayFormat.parse(rangeParts[0]);
            endDate = dayFormat.parse(rangeParts[1]);
        } catch (ParseException e) {
            LOG.error("Illegal date format in " + timeRange);
            return;
        }

        if (fileSystem == null) {
            throw new IOException("File system for " + indexesPath + " was not initialized");
        }

        // Filter all file/folder paths based on the start-end date range
        FileStatus[] matchingDirs = fileSystem.listStatus(datasetPath, new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String dirName = p.getName();
                try {
                    Date date = dayFormat.parse(dirName);
                    return date.compareTo(startDate) >= 0 && date.compareTo(endDate) <= 0;
                } catch (ParseException e) {
                    LOG.warn("Cannot parse directory name: " + dirName);
                    return false;
                }
            }
        });
        // Some FileSystem versions return null instead of an empty array.
        if (matchingDirs == null || matchingDirs.length == 0) {
            LOG.warn("No matching directories for the given input");
            matchingDirs = new FileStatus[0];
        }

        // Re-indexing check for each matching
        for (FileStatus matchingDir : matchingDirs) {
            String dateString = NASADatasetUtil.extractDateStringFromFileStatus(matchingDir);
            String yearKey = NASADatasetUtil.getYearFormat(dateString);
            String monthKey = NASADatasetUtil.getMonthFormat(dateString);
            String dayKey = NASADatasetUtil.getDayFormat(dateString);

            // Look at the current state before flagging anything, so the
            // decision below is based on what existed before this directory.
            boolean yearIndexed = existYearlyIndexes.containsKey(yearKey);
            boolean monthIndexed = existMonthlyIndexes.containsKey(monthKey);
            boolean dayIndexed = existDailyIndexes.containsKey(dayKey);

            // The day index is built or rebuilt in every case.
            existDailyIndexes.put(dayKey, true);

            if (yearIndexed) {
                // needs to re-build year, month and day indexes
                existYearlyIndexes.put(yearKey, true);
                existMonthlyIndexes.put(monthKey, true);
            } else if (monthIndexed) {
                // needs to re-build month and day indexes
                existMonthlyIndexes.put(monthKey, true);
            } else if (!dayIndexed) {
                // New day: promote to a monthly index once every day of the
                // month is known, and to a yearly index once every month is.
                int daysCountInMonth = getMatchesCountFromMap(existDailyIndexes, monthKey);
                int daysRequired = getNumDaysPerMonth(
                        NASADatasetUtil.extractMonthFromDate(dateString), parseYear(yearKey));

                if (daysCountInMonth >= daysRequired) {
                    existMonthlyIndexes.put(monthKey, true);

                    int monthsCountInYear = getMatchesCountFromMap(existMonthlyIndexes, yearKey);
                    if (monthsCountInYear >= getNumMonthsPerYear()) {
                        existYearlyIndexes.put(yearKey, true);
                    }
                }
            }
        }
        convertNeededIndexesListIntoArrays();
    }

    /** Refreshes the three result arrays from the current flags. */
    private void convertNeededIndexesListIntoArrays() {
        neededDailyIndexes = convertFromMapToArray(existDailyIndexes, dailyIndexesHomePath);
        neededMonthlyIndexes = convertFromMapToArray(existMonthlyIndexes, monthlyIndexesHomePath);
        neededYearlyIndexes = convertFromMapToArray(existYearlyIndexes, yearlyIndexesHomePath);
    }

    /**
     * Builds the paths of all entries flagged {@code true}.
     *
     * @param pathsMap index name to "needs (re)build" flag
     * @param homePath directory the index names are relative to
     * @return one path per flagged entry, in map iteration order
     */
    private Path[] convertFromMapToArray(Map<String, Boolean> pathsMap, Path homePath) {
        ArrayList<Path> flaggedPaths = new ArrayList<Path>();
        for (Map.Entry<String, Boolean> entry : pathsMap.entrySet()) {
            if (Boolean.TRUE.equals(entry.getValue())) {
                flaggedPaths.add(new Path(homePath.toString() + "/" + entry.getKey()));
            }
        }
        return flaggedPaths.toArray(new Path[flaggedPaths.size()]);
    }

    /**
     * Counts the keys of {@code pathsMap} that contain {@code matchingString},
     * regardless of the flag stored for them.
     */
    private int getMatchesCountFromMap(Map<String, Boolean> pathsMap, String matchingString) {
        int matchesCount = 0;
        for (String pathsMapKey : pathsMap.keySet()) {
            if (pathsMapKey.contains(matchingString)) {
                matchesCount++;
            }
        }
        return matchesCount;
    }

    /**
     * Counts the paths whose last component (the text after the final
     * {@code /}) contains {@code inputDateString}.
     */
    @SuppressWarnings("unused")
    private int getMatchingCountFromNeededIndexes(ArrayList<Path> neededIndexesList, String inputDateString) {
        int count = 0;
        for (Path currPath : neededIndexesList) {
            String currPathString = currPath.toString();
            String currDateString = currPathString.substring(currPathString.lastIndexOf("/") + 1);
            if (currDateString.contains(inputDateString)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Loads information about exist indexes on all levels: daily, monthly and
     * yearly
     * 
     * @throws IOException
     * @throws ParseException
     */
    private void loadExistIndexesDictionary() throws IOException, ParseException {
        loadExistingIndexes(dailyIndexesHomePath, existDailyIndexes);
        loadExistingIndexes(monthlyIndexesHomePath, existMonthlyIndexes);
        loadExistingIndexes(yearlyIndexesHomePath, existYearlyIndexes);
    }

    /**
     * Registers every sub-directory of {@code homePath} in {@code target}
     * with the flag {@code false} (present, no rebuild requested yet).
     */
    private void loadExistingIndexes(Path homePath, Map<String, Boolean> target)
            throws IOException, ParseException {
        FileStatus[] entries = fileSystem.listStatus(homePath);
        if (entries == null) {
            // Some FileSystem versions return null for a missing directory.
            return;
        }
        for (FileStatus entry : entries) {
            if (entry.isDir()) {
                target.put(NASADatasetUtil.extractDateStringFromFileStatus(entry), false);
            }
        }
    }

    /** Parses the year part of a date string; returns {@code null} on failure. */
    @SuppressWarnings("unused")
    private Date getYearDate(String fullDateString) {
        String yearDateString = NASADatasetUtil.getYearFormat(fullDateString);
        try {
            return yearFormat.parse(yearDateString);
        } catch (ParseException e) {
            LOG.error("Date Parsing Error");
            return null;
        }
    }

    /** Parses the month part of a date string; returns {@code null} on failure. */
    @SuppressWarnings("unused")
    private Date getMonthDate(String fullDateString) {
        String monthDateString = NASADatasetUtil.getMonthFormat(fullDateString);
        try {
            return monthFormat.parse(monthDateString);
        } catch (ParseException e) {
            LOG.error("Date Parsing Error");
            return null;
        }
    }

    /** Parses the day part of a date string; returns {@code null} on failure. */
    @SuppressWarnings("unused")
    private Date getDayDate(String fullDateString) {
        String dayDateString = NASADatasetUtil.getDayFormat(fullDateString);
        try {
            return dayFormat.parse(dayDateString);
        } catch (ParseException e) {
            LOG.error("Date Parsing Error");
            return null;
        }
    }

    /**
     * Reads a year string as a number.
     *
     * @return the year, or {@link #UNKNOWN_YEAR} if the string is
     *         {@code null} or not a plain integer
     */
    private static int parseYear(String yearString) {
        if (yearString == null) {
            return UNKNOWN_YEAR;
        }
        try {
            return Integer.parseInt(yearString.trim());
        } catch (NumberFormatException e) {
            return UNKNOWN_YEAR;
        }
    }

    /** Gregorian leap-year rule; non-positive (unknown) years are not leap. */
    private static boolean isLeapYear(int year) {
        return year > 0 && ((year % 4 == 0 && year % 100 != 0) || year % 400 == 0);
    }

    /**
     * Number of days in a month.
     *
     * @param month month number, 1 (January) to 12 (December)
     * @param year  year used to decide the length of February; pass
     *              {@link #UNKNOWN_YEAR} to treat February as 28 days
     * @return the day count, or 0 for a month outside 1..12
     */
    private int getNumDaysPerMonth(int month, int year) {
        switch (month) {
        case 2:
            return isLeapYear(year) ? 29 : 28;
        case 4:
        case 6:
        case 9:
        case 11:
            return 30;
        case 1:
        case 3:
        case 5:
        case 7:
        case 8:
        case 10:
        case 12:
            return 31;
        default:
            return 0;
        }
    }

    private int getNumMonthsPerYear() {
        return 12;
    }

    @SuppressWarnings("unused")
    private int getNumDaysPerYear(int year) {
        return isLeapYear(year) ? 366 : 365;
    }

    /** @return daily index paths flagged by the last successful preparation; never {@code null} */
    public Path[] getNeededDailyIndexes() {
        return neededDailyIndexes;
    }

    /** @return monthly index paths flagged by the last successful preparation; never {@code null} */
    public Path[] getNeededMonthlyIndexes() {
        return neededMonthlyIndexes;
    }

    /** @return yearly index paths flagged by the last successful preparation; never {@code null} */
    public Path[] getNeededYearlyIndexes() {
        return neededYearlyIndexes;
    }

    public Path getDailyIndexesHomePath() {
        return dailyIndexesHomePath;
    }

    public Path getMonthlyIndexesHomePath() {
        return monthlyIndexesHomePath;
    }

    public Path getYearlyIndexesHomePath() {
        return yearlyIndexesHomePath;
    }

    private static void printUsage() {
        System.out.println("Performs a temporal indexing for data stored in hadoop");
        System.out.println("Parameters: (* marks required parameters)");
        System.out.println("<dataset path> - (*) Path to input dataset");
        System.out.println("<index path> - (*) Path to index output");
        System.out.println("time:yyyy.mm.dd..yyyy.mm.dd - (*) Time range");
        System.out.println("-overwrite - Overwrite output file without notice");
        GenericOptionsParser.printGenericCommandUsage(System.out);
    }

    /**
     * Command-line entry point: prints the daily, monthly and yearly index
     * paths that need to be built for the given dataset path, index path and
     * {@code time} range.
     */
    public static void main(String[] args) throws IOException, ParseException {
        // Parse parameters
        OperationsParams params = new OperationsParams(new GenericOptionsParser(args));
        final Path[] paths = params.getPaths();
        if (paths.length <= 1 && !params.checkInput()) {
            printUsage();
            System.exit(1);
        }
        // Both a dataset path and an index path are required; without this
        // length check a single valid input path would fall through to
        // paths[1] below and fail with ArrayIndexOutOfBoundsException.
        if (paths.length < 2 || paths[1] == null) {
            printUsage();
            System.exit(1);
        }
        if (params.get("time") == null) {
            System.err.println("You must provide a time range");
            printUsage();
            System.exit(1);
        }

        Path datasetPath = paths[0]; // dataset path
        Path indexesPath = paths[1]; // index path
        String timeRange = params.get("time"); // time range

        TemporalIndexManager temporalIndexManager = new TemporalIndexManager(datasetPath, indexesPath);
        temporalIndexManager.prepareNeededIndexes(timeRange);

        System.out.println("Daily Indexes: ");
        for (Path path : temporalIndexManager.getNeededDailyIndexes()) {
            System.out.println(path.toString());
        }

        System.out.println("Monthly Indexes: ");
        for (Path path : temporalIndexManager.getNeededMonthlyIndexes()) {
            System.out.println(path.toString());
        }

        System.out.println("Yearly Indexes: ");
        for (Path path : temporalIndexManager.getNeededYearlyIndexes()) {
            System.out.println(path.toString());
        }

    }

}