sadl.input.TimedInput.java Source code

Introduction

Here is the source code for sadl.input.TimedInput.java
Source

/**
 * This file is part of SADL, a library for learning all sorts of (timed) automata and performing sequence-based anomaly detection.
 * Copyright (C) 2013-2016  the original author or authors.
 *
 * SADL is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 *
 * SADL is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with SADL.  If not, see <http://www.gnu.org/licenses/>.
 */
package sadl.input;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.function.Function;

import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import gnu.trove.map.TObjectIntMap;
import gnu.trove.map.hash.TObjectIntHashMap;
import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;
import sadl.constants.AnomalyInsertionType;
import sadl.constants.ClassLabel;
import sadl.utils.MasterSeed;

/**
 * Class for reading a set of timed sequences from a file or writing them to a file.
 * 
 * @author Fabian Witter
 * @author Timo Klerx
 *
 */
public class TimedInput implements Iterable<TimedWord>, Serializable {
    // TODO we also need (untimed)Input for PDFAs etc
    private static final long serialVersionUID = -2175576201528760567L;

    private static Logger logger = LoggerFactory.getLogger(TimedInput.class);

    private final TObjectIntMap<String> alphabet = new TObjectIntHashMap<>();
    private final List<String> alphabetRev = new ArrayList<>();
    private List<TimedWord> words = new ArrayList<>();

    private static final String[] parseSymbols = new String[] { "^\\(", "\\)$", "\\)\\s+\\(", "\\s*,\\s*",
            "\\s*:\\s*" };
    private static final String[] parseSymbolsAlt = new String[] { "^\\d+ ", "$", "\\s{2}", "\\s", "\\s*:\\s*" };
    private static final int parseStart = 0;
    private static final int parseStartAlt = 1;

    public TimedInput(List<TimedWord> words) {

        this.words.addAll(words);
        for (final TimedWord w : words) {
            if (w.getClass() == TimedWord.class) {
                for (final String s : w.symbols) {
                    if (!alphabet.containsKey(s)) {
                        alphabet.put(s, alphabet.size());
                        alphabetRev.add(s);
                    }
                }
            }
        }
    }

    public TimedInput(String[] alphabet) {

        for (int i = 0; i < alphabet.length; i++) {
            this.alphabet.put(alphabet[i], i);
            alphabetRev.add(alphabet[i]);
        }
    }

    /**
     * Tries to parse sequences from a file in the old or new format. If both fail, an exception is thrown.
     * 
     * @param in
     *            A {@link Path} that contains timed sequences in the appropriate format
     * @return A {@link TimedInput} that represents the timed sequences parsed
     * @throws IOException
     * @implNote The {@link #parse(Path)} and {@link #parseAlt(Path)} methods are called.
     */
    public static TimedInput tryParse(Path in) throws IOException {
        try {
            return parse(in);
        } catch (final IOException | IllegalArgumentException e) {
            return parseAlt(in);
        }
    }

    // TODO maybe add parsing for anomaly type?!
    /**
     * Parses timed sequences from a file. Each line contains exactly one of those sequences that have the following format:
     * 
     * <pre>
     * {@code (s_1,t_1) (s_2,t_2) ... (s_n,t_n) : label}
     * </pre>
     * 
     * where {@code s_i} is an (event) symbol of type {@link String} and {@code t_i} is the corresponding time delay of type {@code int}. The label at the end
     * is of type {@link ClassLabel} and is optional.
     * 
     * @param in
     *            A {@link Path} that contains timed sequences in the appropriate format
     * @return A {@link TimedInput} that represents the timed sequences parsed
     * @throws IOException
     */
    public static TimedInput parse(Path in) throws IOException {
        return parseCustom(in, parseStart, parseSymbols[0], parseSymbols[1], parseSymbols[2], parseSymbols[3],
                parseSymbols[4]);
    }

    /**
     * Parses timed sequences from a file. Each line contains exactly one of those sequences that have the following format:
     * 
     * <pre>
     * {@code (s_1,t_1) (s_2,t_2) ... (s_n,t_n) : label}
     * </pre>
     * 
     * where {@code s_i} is an (event) symbol of type {@link String} and {@code t_i} is the corresponding time delay of type {@code int}. The label at the end
     * is of type {@link ClassLabel} and is optional.
     * 
     * @param br
     *            A {@link InputStream} that contains timed sequences in the appropriate format
     * @return A {@link TimedInput} that represents the timed sequences parsed
     * @throws IOException
     */
    public static TimedInput parse(Reader br) throws IOException {
        return parse(br, false);
    }

    public static TimedInput parse(Reader br, boolean skipFirstElement) throws IOException {
        return parseCustom(br, parseStart, parseSymbols[0], parseSymbols[1], parseSymbols[2], parseSymbols[3],
                parseSymbols[4], skipFirstElement);

    }

    /**
     * Parses timed sequences from a file that has the following alternative format:
     * 
     * <pre>
     * {@code x y}
     * {@code n s_1 t_1  s_2 t_2 ... s_n t_n : label}
     * :
     * :
     * </pre>
     * 
     * where @{@code x} is the number of sequences contained in that file and @ {@code y} the number of different symbols contained. Each of the following lines
     * contains exactly one sequence, where {@code s_i} is an (event) symbol of type {@link String} and {@code t_i} is the corresponding time delay of type
     * {@link Integer}. The {@code n} at the beginning of the line is the number of symbols in that sequence. Note that between {@code t_i} and {@code s_(i+1)}
     * is a double space.The label at the end is of type {@link ClassLabel} and is optional.
     * 
     * @param in
     *            A {@link Path} that contains timed sequences in the appropriate alternative format
     * @return A {@link TimedInput} that represents the timed sequences parsed
     * @throws IOException
     */
    public static TimedInput parseAlt(Path in) throws IOException {
        return parseAlt(in, parseStartAlt);
    }

    /**
     * Parses timed sequences from a file that has the following alternative format:
     * 
     * <pre>
     * {@code x y}
     * {@code n s_1 t_1  s_2 t_2 ... s_n t_n : label}
     * :
     * :
     * </pre>
     * 
     * where @{@code x} is the number of sequences contained in that file and @ {@code y} the number of different symbols contained. Each of the following lines
     * contains exactly one sequence, where {@code s_i} is an (event) symbol of type {@link String} and {@code t_i} is the corresponding time delay of type
     * {@link Integer}. The {@code n} at the beginning of the line is the number of symbols in that sequence. Note that between {@code t_i} and {@code s_(i+1)}
     * is a double space.The label at the end is of type {@link ClassLabel} and is optional.
     * 
     * @param br
     *            A {@link BufferedReader} that contains timed sequences in the appropriate alternative format
     * @return A {@link TimedInput} that represents the timed sequences parsed
     * @throws IOException
     */
    public static TimedInput parseAlt(Reader br) throws IOException {
        return parseAlt(br, parseStartAlt);
    }

    /**
     * Parses timed sequences from a file that has the following alternative format:
     * 
     * <pre>
     * {@code x y}
     * {@code n s_1 t_1  s_2 t_2 ... s_n t_n : label}
     * :
     * :
     * </pre>
     * 
     * where @{@code x} is the number of sequences contained in that file and @ {@code y} the number of different symbols contained. Each of the following lines
     * contains exactly one sequence, where {@code s_i} is an (event) symbol of type {@link String} and {@code t_i} is the corresponding time delay of type
     * {@link Integer}. The {@code n} at the beginning of the line is the number of symbols in that sequence. Note that between {@code t_i} and {@code s_(i+1)}
     * is a double space.The label at the end is of type {@link ClassLabel} and is optional.
     * 
     * @param in
     *            A {@link Path} that contains timed sequences in the appropriate alternative format
     * @param lineOffset
     *            The number of lines that will be skipped at the beginning of the file because they contain a header with meta data
     * @return A {@link TimedInput} that represents the timed sequences parsed
     * @throws IOException
     */
    public static TimedInput parseAlt(Path in, int lineOffset) throws IOException {
        return parseCustom(in, lineOffset, parseSymbolsAlt[0], parseSymbolsAlt[1], parseSymbolsAlt[2],
                parseSymbolsAlt[3], parseSymbolsAlt[4]);
    }

    /**
     * Parses timed sequences from a file that has the following alternative format:
     * 
     * <pre>
     * {@code x y}
     * {@code n s_1 t_1  s_2 t_2 ... s_n t_n : label}
     * :
     * :
     * </pre>
     * 
     * where @{@code x} is the number of sequences contained in that file and @ {@code y} the number of different symbols contained. Each of the following lines
     * contains exactly one sequence, where {@code s_i} is an (event) symbol of type {@link String} and {@code t_i} is the corresponding time delay of type
     * {@link Integer}. The {@code n} at the beginning of the line is the number of symbols in that sequence. Note that between {@code t_i} and {@code s_(i+1)}
     * is a double space.The label at the end is of type {@link ClassLabel} and is optional.
     * 
     * @param br
     *            A {@link BufferedReader} that contains timed sequences in the appropriate alternative format
     * @param lineOffset
     *            The number of lines that will be skipped at the beginning of the file because they contain a header with meta data
     * @return A {@link TimedInput} that represents the timed sequences parsed
     * @throws IOException
     */
    public static TimedInput parseAlt(Reader br, int lineOffset) throws IOException {
        return parseCustom(br, lineOffset, parseSymbolsAlt[0], parseSymbolsAlt[1], parseSymbolsAlt[2],
                parseSymbolsAlt[3], parseSymbolsAlt[4]);
    }

    /**
     * Parses timed sequences from a file in a custom format:
     * 
     * @param in
     *            A {@link Path} that contains timed sequences in the appropriate alternative format
     * @param lineOffset
     *            The number of lines that will be skipped at the beginning of the file because they contain a header with meta data
     * @param seqPrefix
     *            A regular expression that matches the prefix of each sequence; after removing the prefix the line must begin with the first symbol
     *            {@link String}
     * @param seqPostfix
     *            A regular expression that matches the postfix of each sequence (until the regular expression in {@code classSep} appears); after removing the
     *            postfix the line must end with the last time delay.
     * @param pairSep
     *            A regular expression that matches the separator between two value pairs in a sequence; must not be a substring of {@code valueSep}
     * @param valueSep
     *            A regular expression that matches the separator between two values of a pair in a sequence
     * @param classSep
     *            A regular expression that matches the separator between the sequence and the optional class label of a sequence; must not be a substring of
     *            {@code pairSep}
     * @return A {@link TimedInput} that represents the timed sequences parsed
     * @throws IOException
     */
    public static TimedInput parseCustom(Path in, int lineOffset, String seqPrefix, String seqPostfix,
            String pairSep, String valueSep, String classSep) throws IOException {
        if (Files.notExists(in)) {
            logger.warn("File {} was not found.", in);
            throw new FileNotFoundException("input file on path " + in.toAbsolutePath() + " was not found");
        }
        try (BufferedReader br = Files.newBufferedReader(in)) {
            return parseCustom(br, lineOffset, seqPrefix, seqPostfix, pairSep, valueSep, classSep);
        }
    }

    /**
     * Parses timed sequences from a file in a custom format:
     * 
     * @param br
     *            A {@link BufferedReader} that contains timed sequences in the appropriate alternative format
     * @param lineOffset
     *            The number of lines that will be skipped at the beginning of the file because they contain a header with meta data
     * @param seqPrefix
     *            A regular expression that matches the prefix of each sequence; after removing the prefix the line must begin with the first symbol
     *            {@link String}
     * @param seqPostfix
     *            A regular expression that matches the postfix of each sequence (until the regular expression in {@code classSep} appears); after removing the
     *            postfix the line must end with the last time delay.
     * @param pairSep
     *            A regular expression that matches the separator between two value pairs in a sequence; must not be a substring of {@code valueSep}
     * @param valueSep
     *            A regular expression that matches the separator between two values of a pair in a sequence
     * @param classSep
     *            A regular expression that matches the separator between the sequence and the optional class label of a sequence; must not be a substring of
     *            {@code pairSep}
     * @return A {@link TimedInput} that represents the timed sequences parsed
     * @throws IOException
     */
    public static TimedInput parseCustom(Reader br, int lineOffset, String seqPrefix, String seqPostfix,
            String pairSep, String valueSep, String classSep) throws IOException {
        return parseCustom(br, lineOffset, seqPrefix, seqPostfix, pairSep, valueSep, classSep, false);
    }

    public static TimedInput parseCustom(Reader br, int lineOffset, String seqPrefix, String seqPostfix,
            String pairSep, String valueSep, String classSep, boolean skipFirstElement) throws IOException {
        final String pre = !seqPrefix.startsWith("^") ? "^" + seqPrefix : seqPrefix;
        final String post = !seqPostfix.endsWith("$") ? seqPostfix + "$" : seqPostfix;

        return new TimedInput(br, lineOffset, pre, post, pairSep, valueSep, classSep, skipFirstElement);
    }

    private TimedInput(Reader br, int lineOffset, String seqPrefix, String seqPostfix, String pairSep,
            String valueSep, String classSep, boolean skipFirstElement) throws IOException {
        loadData(br, lineOffset, seqPrefix, seqPostfix, pairSep, valueSep, classSep, skipFirstElement);
    }

    private void loadData(Reader br, int lineOffset, String seqPrefix, String seqPostfix, String pairSep,
            String valueSep, String classSep, boolean skipFirstElement) throws IOException {

        try (BufferedReader in = new BufferedReader(br)) {

            // Skip offset lines at the beginning of the file
            int counter = 0;
            while (counter < lineOffset) {
                in.readLine();
                counter++;
            }

            TimedWord word;
            String symbol;
            int timeDelay;
            String line;
            String[] splitWord;
            String[] splitPair;
            int lineCount = 0;
            while ((line = in.readLine()) != null) {
                if (line.isEmpty()) {
                    continue;
                }
                word = new TimedWord();

                // Split and parse class label (if it exists)
                splitWord = line.split(classSep, 2);
                line = splitWord[0];
                ClassLabel label;
                if (splitWord.length == 2) {
                    switch (splitWord[1]) {
                    case "0":
                        label = ClassLabel.NORMAL;
                        break;
                    case "1":
                        label = ClassLabel.ANOMALY;
                        break;
                    default:
                        label = ClassLabel.NORMAL;
                        break;
                    }
                    word.setLabel(label);
                }
                line = line.trim();
                // Remove sequence prefix
                line = line.replaceAll(seqPrefix, "");

                // Remove sequence postfix
                line = line.replaceAll(seqPostfix, "");
                if (!line.isEmpty()) {
                    // Parse sequence
                    splitWord = line.split(pairSep);
                    int i = 0;
                    if (skipFirstElement) {
                        i = 1;
                    }
                    for (; i < splitWord.length; i++) {
                        splitPair = splitWord[i].split(valueSep, 2);
                        if (splitPair.length < 2) {
                            final String errorMessage = "Pair \"" + splitWord[i] + "\" in line " + lineCount
                                    + " is in the wrong format. Separator \"" + valueSep + "\" not found!";
                            final IllegalArgumentException e = new IllegalArgumentException(errorMessage);
                            logger.error(errorMessage, e);
                            throw e;
                        }
                        symbol = splitPair[0];
                        if (symbol.matches("\\W")) {
                            // Only characters, digits and underscores are allowed for
                            // event names ([a-zA-Z_0-9])
                            final String errorMessage = "Event name \"" + symbol + "\" in line " + lineCount
                                    + " contains forbidden characters. " + "Only [a-zA-Z_0-9] are allowed.";
                            final IllegalArgumentException e = new IllegalArgumentException(errorMessage);
                            logger.error(errorMessage, e);
                            throw e;
                        }
                        timeDelay = Integer.parseInt(splitPair[1].trim());
                        if (!alphabet.containsKey(symbol)) {
                            alphabet.put(symbol, alphabet.size());
                            alphabetRev.add(symbol);
                        }
                        // Use String in alphabet to avoid redundant event name
                        // instances in input
                        word.appendPair(alphabetRev.get(alphabet.get(symbol)), timeDelay);
                    }
                }
                words.add(word);
                lineCount++;
            }
            br.close();
        }
    }

    /**
     * States whether the {@link TimedInput} contains any timed sequences or not.
     * 
     * @return {@code true} if and only if the {@link TimedInput} contains at least one timed sequence
     */
    public boolean isEmpty() {
        return words.isEmpty();
    }

    private boolean cleared = false;

    /**
     * Removes all {@link TimedWord}s from the {@link TimedInput} to reduce memory consumption.
     */
    public void clearWords() {
        words.clear();
        cleared = true;
    }

    /**
     * Returns the {@link TimedWord} at the given index.
     * 
     * @param i
     *            The index to get the {@link TimedWord} for
     * @return The {@link TimedWord} at the given index or {@code null} if the index does not exist
     */
    public TimedWord getWord(int i) {
        return get(i);
    }

    /**
     * Returns the {@link TimedWord} at the given index.
     * 
     * @param i
     *            The index to get the {@link TimedWord} for
     * @return The {@link TimedWord} at the given index or {@code null} if the index does not exist
     */
    public TimedWord get(int i) {
        checkCleared();
        return words.get(i);
    }

    /**
     * Returns the number of timed sequences contained in the {@link TimedInput} .
     * 
     * @return The number of timed sequences
     */
    public int size() {
        checkCleared();
        return words.size();
    }

    /**
     * Returns the number of distinct symbol {@link String}s contained in the {@link TimedInput}.
     * 
     * @return The number of distinct symbols
     */
    public int getAlphSize() {
        return alphabet.size();
    }

    /**
     * Returns the symbol {@link String} with the given index from the {@link TimedInput}.
     * 
     * @param i
     *            The index to get the {@link TimedWord} for
     * @return The symbol with the given index or {@code null} if the index does not exist
     */
    public String getSymbol(int i) {
        return alphabetRev.get(i);
    }

    /**
     * Returns the index of a given symbol {@link String} in the {@link TimedInput}.
     * 
     * @param s
     *            The symbol to get the index for
     * @return The index for the given symbol or {@code -1} if the symbol is not contained in the {@link TimedInput}
     */
    public int getAlphIndex(String s) {
        if (alphabet.containsKey(s)) {
            return alphabet.get(s);
        }
        return -1;
    }

    public String[] getSymbols() {
        final String[] result = alphabet.keys(new String[0]);
        Arrays.sort(result);
        return result;
    }

    /**
     * Returns the {@link String} representation of the {@link TimedInput} in standard format with class labels.
     * 
     * @return The {@link String} representation of the {@link TimedInput}
     * @see TimedWord#toString(boolean)
     * @see TimedInput#parse(Path)
     */
    @Override
    public String toString() {
        final StringBuilder builder = new StringBuilder();
        try {
            toFile(builder, true);
            return builder.toString();
        } catch (final IOException e) {
            logger.error("Unexpected exception", e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Writes the {@link String} representation of the {@link TimedInput} in standard format as it is required for parsing with {@link TimedInput#parse(Path)}.
     * 
     * @param bw
     *            The writer where to write the {@link String} representation
     * @param withClassLabel
     *            If {@code true} the {@link ClassLabel} will be appended at the end of each timed sequence
     * @see TimedWord#toString(boolean)
     * @see TimedInput#parse(Path)
     */
    public void toFile(Appendable bw, boolean withClassLabel) throws IOException {
        toFile(bw, word -> word.toString(withClassLabel));
    }

    private void toFile(Appendable a, Function<TimedWord, String> f) throws IOException {
        checkCleared();
        for (int i = 0; i < words.size(); i++) {
            a.append(f.apply(words.get(i)));
            if (i < (words.size() - 1)) {
                a.append('\n');
            }
        }
    }

    /**
     * Writes the {@link String} representation of the {@link TimedInput} in alternative format as it is required for parsing with
     * {@link TimedInput#parseAlt(Path)}.
     * 
     * @param bw
     *            The writer where to write the {@link String} representation
     * @param withClassLabel
     *            If {@code true} the {@link ClassLabel} will be appended at the end of each timed sequence
     * @see TimedWord#toString(boolean)
     * @see TimedInput#parseAlt(Path)
     */
    public void toFileAlt(Appendable bw, boolean withClassLabel) throws IOException {
        bw.append(Integer.toString(words.size()));
        bw.append(' ');
        bw.append(Integer.toString(alphabet.size()));
        bw.append('\n');
        toFile(bw, word -> word.toStringAlt(withClassLabel));
    }

    @Override
    public Iterator<TimedWord> iterator() {
        checkCleared();
        return new Iterator<TimedWord>() {
            int i = 0;

            @Override
            public boolean hasNext() {
                return i < size();
            }

            @Override
            public TimedWord next() {
                i++;
                return getWord(i - 1);
            }
        };
    }

    protected void checkCleared() {
        if (cleared) {
            throw new IllegalStateException("Input was cleared before");
        }
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((alphabet == null) ? 0 : alphabet.hashCode());
        result = prime * result + ((alphabetRev == null) ? 0 : alphabetRev.hashCode());
        result = prime * result + ((words == null) ? 0 : words.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        final TimedInput other = (TimedInput) obj;
        if (alphabet == null) {
            if (other.alphabet != null) {
                return false;
            }
        } else if (!alphabet.equals(other.alphabet)) {
            return false;
        }
        if (alphabetRev == null) {
            if (other.alphabetRev != null) {
                return false;
            }
        } else if (!alphabetRev.equals(other.alphabetRev)) {
            return false;
        }
        if (words == null) {
            if (other.words != null) {
                return false;
            }
        } else if (!words.equals(other.words)) {
            return false;
        }
        return true;
    }

    /**
     * Decreases the size of the timed input.
     * 
     * @param d
     *            the value to decrease the sample size. Must be between 0 and 1
     */
    public void decreaseSamples(double d) {
        if (d > 0 && d < 1) {
            words = new ArrayList<>(words.subList(0, (int) (words.size() * d)));
        }
    }

    public List<TimedWord> getWords() {
        return Collections.unmodifiableList(words);
    }

    Random r = null;

    public TimedInput insertRandomAnomalies(AnomalyInsertionType type, double anomalyPercentage) {
        if (r == null) {
            r = MasterSeed.nextRandom();
        }
        final TimedInput result = SerializationUtils.clone(this);
        final int size = result.size();
        final int requestedAnomalies = (int) (size * anomalyPercentage);
        final TIntSet anomalyIndexes = new TIntHashSet();
        while (anomalyIndexes.size() < requestedAnomalies) {
            int index = r.nextInt(anomalyIndexes.size());
            while (anomalyIndexes.contains(index)) {
                index = r.nextInt(anomalyIndexes.size());
            }

            if (type == AnomalyInsertionType.TYPE_ONE) {

            } else if (type == AnomalyInsertionType.TYPE_TWO) {

            } else if (type == AnomalyInsertionType.TYPE_THREE) {

            } else if (type == AnomalyInsertionType.TYPE_FOUR) {

            } else if (type == AnomalyInsertionType.TYPE_FIVE) {

            } else if (type == AnomalyInsertionType.ALL) {

            }
            anomalyIndexes.add(index);
        }
        return result;
    }
}