com.addthis.hydra.task.source.AbstractPersistentStreamSource.java Source code

Java tutorial

Introduction

Here is the source code for com.addthis.hydra.task.source.AbstractPersistentStreamSource.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.task.source;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

import com.addthis.basis.util.Bytes;
import com.addthis.basis.util.Parameter;
import com.addthis.basis.util.Strings;

import com.addthis.codec.Codec;
import com.addthis.codec.CodecJSON;
import com.addthis.hydra.task.stream.PersistentStreamFileSource;
import com.addthis.hydra.task.stream.StreamFileUtil;
import com.addthis.maljson.JSONObject;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An abstract implementation of {@link com.addthis.hydra.task.stream.PersistentStreamFileSource}
 * that provides much of the base functionality required to implement a streaming file source.
 * The main purpose of this class is to parse an input configuration in order
 * to provide the common inputs that concrete implementations require to
 * identify the files that the data source should provide to clients.
 */
public abstract class AbstractPersistentStreamSource implements PersistentStreamFileSource {

    private static final Logger log = LoggerFactory.getLogger(AbstractPersistentStreamSource.class);

    // note that for historical reason these parameters use 'mesh' in their descriptions
    private static final String DEFAULT_DATE_FORMAT = Parameter.value("source.mesh.date.format", "YYMMdd");
    private static final int DEFAULT_SORT_TOKEN_OFFSET = Parameter.intValue("source.mesh.sort.token.offset", 5);
    private static final int DEFAULT_PATH_TOKEN_OFFSET = Parameter.intValue("source.mesh.path.token.offset", 0);
    private static final String DEFAULT_PATH_TOKEN = Parameter.value("source.mesh.path.token", "/");

    /** Number of milliseconds in one hour. */
    public static final long ONE_HOUR_IN_MILLIS = 60 * 60 * 1000;
    /** Number of milliseconds in one day. */
    public static final long ONE_DAY_IN_MILLIS = 24 * ONE_HOUR_IN_MILLIS;

    // A date string containing NOW_PREFIX is resolved relative to the current time;
    // the text between the prefix (plus one sign character) and NOW_POSTFIX is a day offset.
    private static final String NOW_PREFIX = "{{now";
    private static final String NOW_POSTFIX = "}}";
    /** Date expression that resolves to the current time with no day offset. */
    public static final String TIME_NOW = "{{now}}";

    /**
     * The format of startDate and endDate values using the
     * <a href="http://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html">DateTimeFormat</a>.
     * Default is either "source.mesh.date.format" configuration value or "YYMMdd".
     * The length of this pattern also influences the step used when the date list is built:
     * a length of 6 steps by days and a length of 8 steps by hours.
     */
    @Codec.Set(codable = true)
    private String dateFormat = DEFAULT_DATE_FORMAT;

    /**
     * files that have been created before this date will not be processed. Default is {{now}}.
     * May be replaced during init by the "lastDate" value of the auto-resume file.
     */
    @Codec.Set(codable = true)
    private String startDate = TIME_NOW;

    /**
     * files that have been created after this date will not be processed. Default is {{now}}.
     */
    @Codec.Set(codable = true)
    private String endDate = TIME_NOW;

    /**
     * If true then process the dates from the most recent date to the earliest date. Default is false.
     */
    @Codec.Set(codable = true)
    protected boolean reverse;

    /**
     * list of file paths to process. This field is required.
     */
    @Codec.Set(codable = true, required = true)
    private String[] files;

    /**
     * When selecting a substring of the input files for either sorting the file names
     * or fetching the file paths then use this token as the path separator.
     * Default is "source.mesh.path.token" configuration value or "/".
     */
    @Codec.Set(codable = true)
    private String sortToken = DEFAULT_PATH_TOKEN;

    /**
     * shift the sorting suffix by this many characters. Default is 0.
     */
    @Codec.Set(codable = true)
    private int sortOffset;

    /**
     * skip this number of sortToken characters for the sorting suffix. Default is "source.mesh.sort.token.offset" configuration value or 5.
     */
    @Codec.Set(codable = true)
    private int sortTokenOffset = DEFAULT_SORT_TOKEN_OFFSET;

    /**
     * shift the generated file path by this many characters. Default is 0.
     */
    @Codec.Set(codable = true)
    private int pathOffset;

    /**
     * skip this number of sortToken characters for generating file paths. Default is "source.mesh.path.token.offset" configuration value or 0.
     */
    @Codec.Set(codable = true)
    private int pathTokenOffset = DEFAULT_PATH_TOKEN_OFFSET;

    /**
     * Default is 1.
     * NOTE(review): this field is not read anywhere in this class - confirm its purpose and whether it is still used.
     */
    @Codec.Set(codable = true)
    private int jitterDays = 1;

    /**
     * NOTE(review): this field is not read anywhere in this class - confirm its purpose and whether it is still used.
     */
    @Codec.Set(codable = true)
    private String startDateBaseDir;

    /**
     * Explicit step between generated dates. The values compared against when the
     * date list is built are "DAYS", "HOURS" and "MONTHS". Default is null, in which
     * case the step is derived from the length of dateFormat.
     */
    @Codec.Set(codable = true)
    private String dateIncrements;

    /**
     * If true, and a readable, non-empty auto-resume file exists in the state directory,
     * its "lastDate" value replaces startDate during init. Default is false.
     *
     * note: this is based on which files have been opened. If there is a large preOpen queue or many worker threads
     * then multiple days may be open at once, but this setting will assume that the latest day is the one to resume from.
     */
    @Codec.Set(codable = true)
    private boolean autoResume;

    /** Dates to process, filled while init runs; newest first when reverse is set, oldest first otherwise. */
    protected final LinkedList<DateTime> dates = new LinkedList<>();
    /** Formatter built from dateFormat during init; null until then unless a subclass assigns it. */
    protected DateTimeFormatter formatter;
    /** Value reported by hasMoreData(); this class never writes it, so subclasses are expected to maintain it. */
    protected volatile boolean moreData;
    /** Directory holding persistent state for this source; assigned by setStateDir. */
    private File stateDir;
    /** The "job.source" file inside the state directory; assigned by setStateDir. */
    protected File autoResumeFile;
    // set to false by shutdown(); not read anywhere in this class
    private final AtomicBoolean running = new AtomicBoolean(true);
    // tokens reserved for date substitution; they are skipped by the {A,B,C} list expansion in init
    private static final List<String> TIME_CONSTANTS = new ArrayList<>(Arrays.asList("YY", "Y", "M", "D", "H"));

    /**
     * Perform any initialization steps specific to the implementing class.
     * Invoked as the last step of {@link #init(File, Integer[])}, after the file
     * list has been expanded and the date list has been populated; the value
     * returned here becomes the return value of init.
     *
     * @return true if initialization was successful
     * @throws IOException if the implementation's initialization fails with an I/O error
     */
    protected abstract boolean doInit() throws IOException;

    /**
     * Perform any shutdown steps specific to the implementing class.
     * Invoked by {@link #shutdown()} after this source has been flagged as no longer running.
     *
     * @throws IOException if the implementation's shutdown fails with an I/O error
     */
    public abstract void doShutdown() throws IOException;

    /**
     * Sets the directory in which this source keeps its state and derives the
     * location of the auto-resume file ("job.source") inside that directory.
     *
     * @param dir directory where state for this source will be maintained
     */
    public void setStateDir(File dir) {
        this.stateDir = dir;
        final File resumeFile = new File(dir, "job.source");
        this.autoResumeFile = resumeFile;
    }

    /**
     * Reports the current value of the volatile {@code moreData} flag. This class
     * never assigns the flag itself, so its value is whatever a subclass last stored.
     *
     * @return true if this source has more data to provide
     */
    public boolean hasMoreData() {
        return moreData;
    }

    /**
     * Checks whether any configured file path contains the text "{{mod", i.e. the
     * start of a template 'mod' element that can be used to segment the input
     * stream between n consumers.
     *
     * @return true if at least one configured file path contains a 'mod' template element
     */
    public boolean hasMod() {
        boolean modFound = false;
        // stop scanning as soon as the first templated path is seen
        for (int i = 0; !modFound && i < files.length; i++) {
            modFound = files[i].contains("{{mod");
        }
        return modFound;
    }

    /**
     * Called by the data source wrapper to perform the common initialization steps:
     * <ol>
     * <li>records the state directory (and with it the auto-resume file location);</li>
     * <li>replaces <code>{{mod}}</code> in every configured file path once per shard,
     *     using the shard number left-padded to a width of three;</li>
     * <li>expands the first bash-style <code>{A,B,C}</code> list found in each path
     *     (the reserved date tokens are skipped). The unexpanded path stays in the
     *     list next to its expansions;</li>
     * <li>resolves the start and end dates, optionally replacing the start date with
     *     the "lastDate" value stored in the auto-resume file;</li>
     * <li>fills the date list and delegates to {@link #doInit()}.</li>
     * </ol>
     *
     * @param stateDir directory where state for this source is maintained
     * @param shards   shard numbers substituted for the <code>{{mod}}</code> template element
     * @return false if no start date is available, otherwise the result of {@link #doInit()}
     * @throws Exception if a date cannot be parsed or the implementation-specific initialization fails
     */
    public boolean init(File stateDir, Integer[] shards) throws Exception {
        if (log.isDebugEnabled()) {
            log.debug("SSM: " + CodecJSON.encodeString(this));
        }
        setStateDir(stateDir);
        if (log.isTraceEnabled()) {
            log.trace("shards :: " + Strings.join(shards, " :: "));
        }
        /* expand files list */
        HashSet<String> matches = new HashSet<>();
        if (log.isTraceEnabled()) {
            log.trace("files.1 :: " + Strings.join(files, " -- "));
        }
        /* expand mods */
        for (String file : files) {
            for (Integer shard : shards) {
                matches.add(file.replace("{{mod}}", Strings.padleft(shard.toString(), 3, Strings.pad0)));
            }
        }
        files = matches.toArray(new String[matches.size()]);
        /* expand {US,DE,FR} bash-style string list */
        for (String file : files) {
            int io1 = file.indexOf("{");
            int io2 = file.indexOf("}");
            if (io1 >= 0 && io2 > io1) {
                String left = file.substring(0, io1);
                String right = file.substring(io2 + 1);
                for (String tok : Strings.splitArray(file.substring(io1 + 1, io2), ",")) {
                    // ignore reserved strings for time based expansion
                    if (!TIME_CONSTANTS.contains(tok)) {
                        String expand = left.concat(tok).concat(right);
                        matches.add(expand);
                        if (log.isTraceEnabled()) {
                            log.trace("expand " + file + " to " + expand);
                        }
                    }
                }
            }
        }
        files = matches.toArray(new String[matches.size()]);
        if (log.isTraceEnabled()) {
            log.trace("files.2 :: " + matches);
            log.trace("files.3 :: " + Strings.join(files, " -- "));
        }
        /* calculate start/end dates if required */
        formatter = DateTimeFormat.forPattern(dateFormat);
        if (autoResume && autoResumeFile.exists() && autoResumeFile.canRead() && autoResumeFile.length() > 0) {
            // try-with-resources guarantees the file handle is released even when
            // reading or JSON parsing fails
            try (FileInputStream resumeStream = new FileInputStream(autoResumeFile)) {
                JSONObject jo = new JSONObject(Bytes.toString(Bytes.readFully(resumeStream)));
                String resumeDate = jo.optString("lastDate");
                // NOTE(review): optString presumably yields an empty string (org.json style)
                // rather than null when the key is absent - confirm against maljson. An empty
                // value must not replace startDate because it cannot be parsed as a date.
                if (resumeDate != null && !resumeDate.isEmpty()) {
                    log.warn("auto resume from " + jo);
                    startDate = resumeDate;
                }
            } catch (Exception ex) {
                // best effort: a broken resume file falls back to the configured start date
                log.warn("corrupted autoResume file: " + autoResumeFile + " ... " + ex, ex);
            }
        }

        if (startDate == null) {
            log.warn("No startDate provided.");
            return false;
        }

        DateTime start = parseDateTime(startDate);
        if (endDate == null) {
            endDate = NOW_PREFIX + NOW_POSTFIX;
            log.warn("End Date not provided, using current time: " + endDate + " as end date for job");
        }
        DateTime end = parseDateTime(endDate);
        /* populate date list from start/end */
        fillDateList(start, end);
        log.info("[init] " + start + " to " + end + " = " + dates.size() + " time units");
        return doInit();
    }

    /**
     * Overrides the configured start date with the given instant, rendered using
     * the configured date format.
     * <p>
     * The start date is only consumed while {@link #init(File, Integer[])} builds the
     * date list, so this override has to be applied before init to have any effect.
     * At that point the shared formatter has normally not been created yet, so a
     * formatter is derived from the configured date format when none is available.
     *
     * @param time the new start time in milliseconds since the epoch
     */
    public void setStartTime(long time) {
        // formatter is assigned by init(); fall back to the same pattern init() would use
        DateTimeFormatter activeFormatter = formatter != null ? formatter : DateTimeFormat.forPattern(dateFormat);
        startDate = activeFormatter.print(time);
        log.warn("override start date with " + startDate);
    }

    /**
     * Flags this source as no longer running and then delegates to
     * {@link #doShutdown()} for the implementation-specific shutdown steps.
     * The running flag is cleared before delegating, so it is already false if
     * doShutdown throws.
     *
     * @throws IOException if the implementation-specific shutdown fails
     */
    public void shutdown() throws IOException {
        running.set(false);
        doShutdown();
    }

    /**
     * Populates {@link #dates} with one entry per time unit from {@code start}
     * through {@code end} (inclusive). Entries are appended in ascending order, or
     * prepended (yielding descending order) when {@code reverse} is set.
     * <p>
     * The step between entries is resolved once, in this order of precedence:
     * <ol>
     * <li>days, if dateIncrements is "DAYS" or the date format is 6 characters long;</li>
     * <li>hours, if dateIncrements is "HOURS" or the date format is 8 characters long;</li>
     * <li>months, if dateIncrements is "MONTHS";</li>
     * <li>days otherwise, with a warning.</li>
     * </ol>
     * NOTE(review): because of this precedence a 6 character date format wins over an
     * explicit "HOURS" or "MONTHS" setting - confirm that this is intended.
     *
     * @param start first date to add
     * @param end   last date that may be added
     */
    private void fillDateList(DateTime start, DateTime end) {
        // the step depends only on configuration, so resolve it once instead of per iteration
        final boolean daily = "DAYS".equals(dateIncrements) || dateFormat.length() == 6;
        final boolean hourly = !daily && ("HOURS".equals(dateIncrements) || dateFormat.length() == 8);
        final boolean monthly = !daily && !hourly && "MONTHS".equals(dateIncrements);
        if (!daily && !hourly && !monthly) {
            if (dateIncrements == null) {
                log.warn("Non-Standard dateFormat: " + dateFormat + " defaulting to daily time increments\n"
                        + "This can be modified to hourly time increments by setting dateIncrements to 'HOURS'");
            } else {
                // previously an unrecognized value never advanced the mark and looped forever
                log.warn("Unrecognized dateIncrements: " + dateIncrements + " defaulting to daily time increments");
            }
        }
        DateTime mark = start;
        while (!mark.isAfter(end)) {
            if (reverse) {
                dates.addFirst(mark);
            } else {
                dates.addLast(mark);
            }
            // every branch advances the mark, which guarantees termination
            if (hourly) {
                mark = mark.plusHours(1);
            } else if (monthly) {
                mark = mark.plusMonths(1);
            } else {
                mark = mark.plusDays(1);
            }
        }
    }

    /**
     * Converts a configured date string into a {@link DateTime}. A string that
     * contains the "now" prefix is resolved relative to the current time, shifted
     * by the day offset embedded in the expression; any other string is parsed
     * with the configured formatter.
     *
     * @param dateString the configured start or end date
     * @return the resolved date
     */
    private DateTime parseDateTime(String dateString) {
        if (!dateString.contains(NOW_PREFIX)) {
            return formatter.parseDateTime(dateString);
        }
        // TODO: be better to get this time from a service
        final DateTime now = new DateTime();
        final int dayShift = findDaysOffset(dateString);
        return now.plusDays(dayShift);
    }

    /**
     * Extracts the signed day offset from a relative date expression such as
     * <code>{{now-2}}</code> or <code>{{now+1}}</code>. The character directly after
     * the "now" prefix is treated as the sign (only '-' negates) and the text
     * between that character and the closing postfix is parsed as the number of days.
     *
     * @param time a date expression, normally containing the "now" prefix
     * @return the day offset, or 0 when the expression has no prefix, no closing
     *         postfix after the prefix, or no digits between sign and postfix
     * @throws NumberFormatException if the text between sign and postfix is not an integer
     */
    private int findDaysOffset(String time) {
        int prefixIndex = time.indexOf(NOW_PREFIX);
        if (prefixIndex < 0) {
            return 0;
        }
        // position of the sign character that directly follows the prefix
        int signIndex = prefixIndex + NOW_PREFIX.length();
        int startIndex = signIndex + 1;
        // only a postfix that closes this expression is relevant, so search from the sign onwards
        int endIndex = time.indexOf(NOW_POSTFIX, signIndex);
        if (endIndex <= startIndex) {
            return 0;
        }
        int offset = Integer.parseInt(time.substring(startIndex, endIndex));
        if (time.charAt(signIndex) == '-') {
            offset = -offset;
        }
        return offset;
    }

    /**
     * Substitutes the date placeholders of a file path template with the fields of
     * the given time: <code>{YY}</code> becomes the year as text, while
     * <code>{Y}</code>, <code>{M}</code>, <code>{D}</code> and <code>{H}</code> become
     * the two digit year, month of year, day of month and hour of day.
     *
     * @param time     the time whose fields are inserted
     * @param template the file path template
     * @return the template with all date placeholders replaced
     */
    private String replaceDateElements(DateTime time, String template) {
        final String expanded = template
                .replace("{YY}", time.year().getAsString())
                .replace("{Y}", getTwoDigit(time.year().get()))
                .replace("{M}", getTwoDigit(time.monthOfYear().get()))
                .replace("{D}", getTwoDigit(time.dayOfMonth().get()))
                .replace("{H}", getTwoDigit(time.hourOfDay().get()));
        if (log.isDebugEnabled()) {
            log.debug("template=" + expanded);
        }
        return expanded;
    }

    /**
     * Renders a number as two digits: values above 99 are reduced to their last
     * two digits and values below 10 are prefixed with a "0".
     *
     * @param value the number to render
     * @return the two digit text form of the value
     */
    private String getTwoDigit(int value) {
        // keep only the last two digits of anything larger than 99
        final int reduced = value > 99 ? value % 100 : value;
        final String digits = Integer.toString(reduced);
        return reduced < 10 ? "0".concat(digits) : digits;
    }

    /**
     * Builds the list of file paths for one date by replacing the date
     * placeholders in every configured file path template.
     *
     * @param timeToLoad the date whose fields are substituted into the templates
     * @return a new array holding one expanded path per configured file, in the same order
     */
    public String[] getDateTemplatedFileList(DateTime timeToLoad) {
        final String[] expanded = new String[files.length];
        int slot = 0;
        while (slot < files.length) {
            expanded[slot] = replaceDateElements(timeToLoad, files[slot]);
            slot++;
        }
        return expanded;
    }

    /**
     * Returns the suffix of a file name that is used for sorting. The suffix starts
     * at sortOffset; when a sort token is configured and sortTokenOffset is positive,
     * the start is additionally moved past the sortTokenOffset-th occurrence of the
     * token. If the name holds fewer occurrences than that, only sortOffset applies.
     *
     * @param name the file name
     * @return the substring of name beginning at the computed offset
     */
    public String getSortOffset(String name) {
        int begin = sortOffset;
        if (sortToken != null && sortTokenOffset > 0) {
            int cursor = 0;
            for (int remaining = sortTokenOffset; remaining > 0; remaining--) {
                cursor = name.indexOf(sortToken, cursor);
                if (cursor < 0) {
                    // ran out of tokens before skipping the requested number
                    break;
                }
                // continue the search just behind the token that was found
                cursor++;
            }
            if (cursor > 0) {
                begin += cursor;
            }
        }
        return name.substring(begin);
    }

    /**
     * Derives the canonical file reference cache key for a file name by delegating
     * to {@link StreamFileUtil#getCanonicalFileReferenceCacheKey}, passing the
     * configured pathOffset, sortToken and pathTokenOffset.
     *
     * @param name the file name
     * @return the cache key computed by StreamFileUtil
     */
    public String getPathOffset(String name) {
        final String cacheKey =
                StreamFileUtil.getCanonicalFileReferenceCacheKey(name, pathOffset, sortToken, pathTokenOffset);
        return cacheKey;
    }

    /**
     * @return the configured file paths; this is the internal array itself, not a copy
     */
    public String[] getFiles() {
        return files;
    }

    /**
     * Replaces the list of file paths to process. The array is stored as given, without copying.
     *
     * @param files the file paths to process
     */
    public void setFiles(String[] files) {
        this.files = files;
    }

    /**
     * Sets how many sortToken occurrences are skipped when computing the sorting suffix.
     *
     * @param sortTokenOffset the number of sortToken occurrences to skip
     */
    public void setSortTokenOffset(int sortTokenOffset) {
        this.sortTokenOffset = sortTokenOffset;
    }

    /**
     * Sets the token offset that is passed on when generating file paths in getPathOffset.
     *
     * @param pathTokenOffset the number of sortToken occurrences to skip for file paths
     */
    public void setPathTokenOffset(int pathTokenOffset) {
        this.pathTokenOffset = pathTokenOffset;
    }
}