dk.netarkivet.common.utils.batch.ExternalBatchMoreClasses.java Source code

Java tutorial

Introduction

Here is the source code for dk.netarkivet.common.utils.batch.ExternalBatchMoreClasses.java

Source

/*
 * #%L
 * Netarchivesuite - common - test
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.common.utils.batch;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.OutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;

/**
 * A batch job which returns the following statistical information about all files in the bitarchive in which it
 * runs:
 * <ul>
 * <li>short form of metadata filename</li>
 * <li>date of creation taken from arc-file header</li>
 * <li>date taken from cdx part</li>
 * <li>date taken from lines starting on form YYYY-MM-DDT</li>
 * <li>date taken from process report line</li>
 * <li>section with statistical data taken from the arc-file consisting of <#urls> <#bytes> <mime-types></li>
 * </ul>
 * <p>
 * This is the class which is used to generate the two jarfiles ExternalBatchSeveralClassesNoPackage.jar and
 * ExternalBatchSeveralClassesWithPackage.jar
 */

@SuppressWarnings({ "serial" })
public class ExternalBatchMoreClasses extends FileBatchJob {
    protected transient Log log = LogFactory.getLog(getClass().getName());

    /**
     * Initialization hook of this batch job. This implementation does nothing: no fields are set and nothing is
     * written to the given stream.
     *
     * @param os the OutputStream to which data is to be written (not used by this implementation)
     */
    public void initialize(OutputStream os) {

    }

    /**
     * Deserialization hook: restores the non-transient state with the default mechanism and then recreates the
     * transient logger, which is not part of the serialized form.
     *
     * @param s the stream the object is being read from
     * @throws IOFailure if the default deserialization fails for any reason
     */
    private void readObject(ObjectInputStream s) {
        final String failureMessage = "Unexpected error during deserialization";
        try {
            s.defaultReadObject();
        } catch (Exception cause) {
            throw new IOFailure(failureMessage, cause);
        }
        // The logger is transient, so it has to be looked up again after deserialization.
        String loggerName = getClass().getName();
        log = LogFactory.getLog(loggerName);
    }

    /**
     * Type to indicate where date was found in metadata-file 0. filedesc, 1. cdx, 2. on form YYYY-MM-DDT, 3. Processors
     * report
     */
    private static enum DateFoundType {
        FIRSTLINE, CDX, FORMATYMD, PROCREPORT
    }

    ;

    /**
     * Holder for a date together with the place in the metadata-file where it was found.
     */
    private static class DateAndFoundType {
        /** The date that was found; the empty string when no date has been set. */
        String date;
        /** Ordinal of the corresponding DateFoundType; -1 when no date has been set. */
        int foundType;

        /** Creates a holder in the "nothing found" state. */
        DateAndFoundType() {
            this.date = "";
            this.foundType = -1;
        }
    }

    /**
     * Tells whether a string consists of digits only.
     *
     * @param s String to be checked
     * @return false as soon as a non-digit character is met; true otherwise, which includes the empty string
     */
    private static boolean isStringNumeric(String s) {
        for (int pos = 0; pos < s.length(); pos++) {
            if (!Character.isDigit(s.charAt(pos))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Collapses runs of the space character in a line, so that the parts of the line are separated by exactly one
     * space and the result neither starts nor ends with a space. Only the space character is treated as a
     * separator; other whitespace is left untouched.
     *
     * @param line the line to be trimmed for spaces
     * @return the line with superfluous spaces removed
     */
    private String extractTrimedLine(String line) {
        StringBuilder collapsed = new StringBuilder();
        for (String token : line.split(" ")) {
            if (token.length() == 0) {
                // produced by consecutive or leading spaces
                continue;
            }
            if (collapsed.length() > 0) {
                collapsed.append(" ");
            }
            collapsed.append(token);
        }
        return collapsed.toString();
    }

    /**
     * Looks for a date in the given line of a metadata file. The date can be
     * <ul>
     * <li>the date of creation taken from the arc-file header (only looked for when firstLine is true),</li>
     * <li>a date taken from the cdx part,</li>
     * <li>a date taken from a line starting on form YYYY-MM-DDT and containing "http://",</li>
     * <li>a date taken from a "Processors report - " line.</li>
     * </ul>
     * If a non-first line matches more than one of these forms, the last match in the order listed above wins.
     *
     * @param line the line to be searched
     * @param mdFile short name of the metadata file; if it contains "-2" the date of the first line is considered
     *            untrustworthy and is not returned
     * @param firstLine true if it is the first line of the metadata-file
     * @return a holder with the date on 8 characters and the ordinal of the DateFoundType telling where it was
     *         found; if no date was found the date is the empty string and the type is -1
     */
    public static DateAndFoundType lookForDate(String line, String mdFile, boolean firstLine) {

        DateAndFoundType resDateFound = new DateAndFoundType();
        resDateFound.date = "";
        resDateFound.foundType = -1;

        if (firstLine) {
            // Find date from first line which is on form:
            // filedesc://<filname> <#.#.#.#> YYYYMMDDHHMMSS ...
            if (line.startsWith("filedesc://")) {
                int firstSpace = line.indexOf(" ", 8); // find 1. " "
                int secondSpace = firstSpace < 0 ? -1 : line.indexOf(" ", firstSpace + 1); // find 2. " "
                // A full 14 character timestamp must follow the second space
                if (secondSpace >= 0 && line.length() >= secondSpace + 1 + 14) {
                    // Only the YYYYMMDD part is used; check that it consists of digits
                    String s = line.substring(secondSpace + 1, secondSpace + 9);
                    // A metadata-2 file means that the date is not trustworthy
                    if (isStringNumeric(s) && mdFile.indexOf("-2") < 0) {
                        resDateFound.date = s;
                        resDateFound.foundType = DateFoundType.FIRSTLINE.ordinal();
                    }
                }
            }
        } else {
            // cdx -line on form
            // metadata://netarkivet.dk/crawl/index/cdx"
            if (line.indexOf("metadata://netarkivet.dk/crawl/index/cdx") >= 0) {
                final String timestampKey = "timestamp=";
                int pos = line.indexOf(timestampKey);
                if (pos > 0 && line.length() >= pos + 14 + timestampKey.length() + 1) {
                    int dateStart = pos + timestampKey.length();
                    resDateFound.date = line.substring(dateStart, dateStart + 8);
                    resDateFound.foundType = DateFoundType.CDX.ordinal();
                }
            }
            // on form YYYY-MM-DDT...
            if (line.indexOf("http://") > 0) {
                if (line.indexOf("-") == 4 && line.indexOf("T") == 10 && line.length() > "2007-04-09T".length()) {
                    String year = line.substring(0, 4);
                    String month = line.substring(5, 7);
                    String delim = line.substring(7, 8);
                    String day = line.substring(8, 10);
                    if (isStringNumeric(year) && isStringNumeric(month) && isStringNumeric(day)
                            && delim.equals("-")) {
                        resDateFound.date = year + month + day;
                        resDateFound.foundType = DateFoundType.FORMATYMD.ordinal();
                    }
                }
            }
            // Processors report
            final String reportPrefix = "Processors report - ";
            int pos = line.indexOf(reportPrefix);
            if (pos >= 0) {
                int dateStart = pos + reportPrefix.length();
                // The 8 date characters must actually be present; a shorter line carries no date
                if (line.length() >= dateStart + 8) {
                    resDateFound.date = line.substring(dateStart, dateStart + 8);
                    resDateFound.foundType = DateFoundType.PROCREPORT.ordinal();
                }
            }
        }
        return resDateFound;
    }

    /**
     * Collects statistical information about files from a metadata arc file and dates from different parts of the
     * metadatafile.
     * <p>
     * The first line of the result holds the four dates (header, cdx, YYYY-MM-DDT, Processors report) separated by
     * commas, where a date that was not found is the empty string; if none of them was found the line holds an
     * error text instead. The following lines hold the statistics section, or an error text if no such section was
     * found.
     *
     * @param file processing file
     * @param mdFileName short name of the metadata file, passed on to lookForDate
     * @return string with dates and extracted statistical information
     * @throws IOFailure if the file cannot be read
     */
    private String readStatInfoFromFile(File file, String mdFileName) {
        /*
         * Array of read dates on 8 characters (YYYYMMDD). Each array position indicates how the date was found
         * (according to type DateFoundType) 0. in header, 1. cdx, 2. on form YYYY-MM-DDT, 3. Processors report
         */
        String[] fileDates = new String[4];
        for (int i = 0; i < fileDates.length; i++) {
            fileDates[i] = "";
        }

        /* Read statistical information */
        StringBuilder statInfo = new StringBuilder();

        try {
            /* File to read */
            BufferedReader in = null;
            try {
                in = new BufferedReader(new FileReader(file));
                String line;

                /* Look for date on first line of arc-file */
                if ((line = in.readLine()) != null) {
                    DateAndFoundType dateFound = lookForDate(line, mdFileName, true);
                    if (dateFound.foundType >= 0) {
                        fileDates[dateFound.foundType] = dateFound.date;
                    }
                }

                /*
                 * Read heritrix stat data from arc-file where data is found in file section beginning with a heading
                 * that contains the text '[mime-types]' and [#urls], as f.ex. [#urls] [#bytes] [mime-types] or
                 * [mime-types] [#urls] [#bytes] The section continues with lines with sets of values and ending with an
                 * empty line or line not split into more than two parts. Note that mime-types can include spaces.
                 *
                 * Furthermore try to look for date elsewhere
                 */
                boolean goOn = true;
                while (goOn && (line = in.readLine()) != null) {
                    // look for date
                    DateAndFoundType dateFound = lookForDate(line, mdFileName, false);
                    if (dateFound.foundType >= 0) {
                        fileDates[dateFound.foundType] = dateFound.date;
                    }

                    // look for header of statistical data
                    if (line.indexOf("[mime-types]") < 0 || line.indexOf("[#urls]") < 0) {
                        continue;
                    }
                    String trimLine = extractTrimedLine(line);

                    // set positions of data according to header
                    String[] lineParts = trimLine.split(" ");
                    int mimePos = 0;
                    int noFields = lineParts.length;
                    String header = "";
                    if (noFields != 3) {
                        header = "Header do not contain 3 header items: '" + line + "'\n";
                    } else {
                        for (int i = 0; i < 3; i++) {
                            if (lineParts[i].equalsIgnoreCase("[mime-types]")) {
                                mimePos = i;
                            } else if (!lineParts[i].equalsIgnoreCase("[#urls]")
                                    && !lineParts[i].equalsIgnoreCase("[#bytes]")) {
                                header = "Header do not contain expected headers (unknown name): '" + line
                                        + "'\n";
                            }
                        }
                        if (header.length() == 0) {
                            header = trimLine;
                        }
                    }
                    statInfo.append(header).append("\n");

                    // read rest of stat data
                    while (goOn && (line = in.readLine()) != null) {
                        String statLine = "";
                        trimLine = extractTrimedLine(line);
                        lineParts = trimLine.split(" ");

                        if (trimLine.length() == 0) {
                            // an empty line ends the section
                            statLine = "\n";
                            goOn = false;
                        }
                        if (noFields != 3 && statLine.length() == 0) {
                            // not seen in first run, but we cannot handle
                            // special cases, because we do not know the types
                            if (noFields != lineParts.length) {
                                goOn = false;
                                statLine = "Ended with non-empty line (no colums !=3): '" + line + "'\n";
                            } else {
                                statLine = trimLine + "\n";
                            }
                        }
                        if (lineParts.length == 1 && statLine.length() == 0) {
                            // we do not know what is going on
                            statLine = "Ended with non-empty line (expected 3 found 1): '" + line + "'\n";
                        }
                        if (lineParts.length == 3 && statLine.length() == 0) {
                            // all in order
                            statLine = trimLine + "\n";
                        }
                        if (lineParts.length == 2 && statLine.length() == 0) {
                            // there may be null mimetypes
                            String formatted = formatLineWithBlankMime(lineParts, mimePos);
                            if (formatted != null) {
                                statLine = formatted + "\n";
                            } else {
                                // we cannot recognize it
                                statLine = "Ended with non-empty line (expected 2 numbers): '" + line + "'\n";
                            }
                        }
                        if (lineParts.length > 3 && statLine.length() == 0) {
                            // there may be mimetypes with spaces
                            String formatted = formatLineWithSpacedMime(lineParts, mimePos);
                            if (formatted != null) {
                                statLine = formatted + "\n";
                            } else {
                                // we cannot recognize it
                                statLine = "Ended with non-empty line (expected numbers in 2 where mimepos was '"
                                        + mimePos + "'): '" + line + "'\n";
                            }
                        }
                        if (statLine.length() == 0) {
                            statLine = "Ended with non-empty line (nothing was calculated - internal error): '"
                                    + line + "'\n";
                        }
                        statInfo.append(statLine);
                    } // while read next line of statlines
                }
            } finally {
                if (in != null) {
                    in.close();
                }
            }
        } catch (IOException e) {
            String msg = "Could not read data from " + file.getAbsolutePath();
            log.warn(msg, e);
            throw new IOFailure(msg, e);
        }

        /* A date from any of the four sources counts, not only the one from the header line */
        boolean dateFound = false;
        for (int i = 0; i < fileDates.length; i++) {
            dateFound = dateFound || (fileDates[i].length() > 0);
        }
        String fileDateTxt;
        if (dateFound) {
            fileDateTxt = fileDates[DateFoundType.FIRSTLINE.ordinal()] + ","
                    + fileDates[DateFoundType.CDX.ordinal()] + "," + fileDates[DateFoundType.FORMATYMD.ordinal()]
                    + "," + fileDates[DateFoundType.PROCREPORT.ordinal()];
        } else {
            fileDateTxt = "Could not read arc file date from " + file.getAbsolutePath();
        }

        if (statInfo.length() == 0) {
            statInfo.append("Could not read statistics from " + file.getAbsolutePath() + "\n");
        }

        /* return result */
        return fileDateTxt + "\n" + statInfo;
    }

    /**
     * Formats a statistics line that has only two parts, i.e. a line where the mime-type is missing.
     *
     * @param parts the two parts of the line
     * @param mimePos position (0-2) of the mime-type column according to the header
     * @return the two numbers with "<blank-mime>" inserted at the mime-type position, separated by single spaces,
     *         or null if the two parts are not both numeric
     */
    private static String formatLineWithBlankMime(String[] parts, int mimePos) {
        if (!isStringNumeric(parts[0]) || !isStringNumeric(parts[1])) {
            return null;
        }
        StringBuilder res = new StringBuilder();
        int nextNumber = 0;
        for (int i = 0; i < 3; i++) {
            if (i > 0) {
                res.append(" ");
            }
            res.append(i == mimePos ? "<blank-mime>" : parts[nextNumber++]);
        }
        return res.toString();
    }

    /**
     * Formats a statistics line that has more than three parts, i.e. a line where the mime-type contains spaces.
     * The parts belonging to the mime-type are joined with "###" so that the result has exactly three columns in
     * the order given by the header.
     *
     * @param parts the parts of the line (more than three)
     * @param mimePos position (0-2) of the mime-type column according to the header
     * @return the line with three columns separated by single spaces, or null if the parts outside the mime-type
     *         are not both numeric
     */
    private static String formatLineWithSpacedMime(String[] parts, int mimePos) {
        // index of the last part belonging to the mime-type; the two numbers occupy the remaining positions
        int mimeEndPos = parts.length - 3 + mimePos;
        StringBuilder res = new StringBuilder();
        for (int i = 0; i < parts.length; i++) {
            boolean partOfMime = i >= mimePos && i <= mimeEndPos;
            if (!partOfMime && !isStringNumeric(parts[i])) {
                return null;
            }
            if (i > 0) {
                // "###" only between two parts of the mime-type, a space between columns
                res.append(i > mimePos && i <= mimeEndPos ? "###" : " ");
            }
            res.append(parts[i]);
        }
        return res.toString();
    }

    /**
     * Writes file name, dates and statistical data from a metadata arcfile to the OutputStream. Files whose name
     * does not contain "metadata" are ignored. The written data is on the form:
     * <metadata arc file name>,<date0>,<date1>,<date2>,<date3> followed by the section with statistical data
     * taken from the arc-file and an empty line. Here
     * <ul>
     * <li>the metadata arc file name is the file name with "metadata-" and ".arc" removed,</li>
     * <li>the dates are on form YYYYMMDD, found: 0. in header, 1. cdx, 2. on form YYYY-MM-DDT, 3. Processors
     * report; a date that was not found is represented by the empty string,</li>
     * <li>the statistical section starts with a header such as <#urls> <#bytes> <mime-types> and is followed by
     * numbers and text according to this header.</li>
     * </ul>
     *
     * @param file an arcfile
     * @param os the OutputStream to which data is to be written
     * @return false if writing the result to the stream fails; otherwise true (also for ignored files)
     */
    public boolean processFile(File file, OutputStream os) {
        ArgumentNotValid.checkNotNull(file, "file");

        // Only metadata files are of interest; anything else is silently accepted
        String arcFileName = file.getName();
        if (!arcFileName.contains("metadata")) {
            return true;
        }

        // Short form of the name: strip the "metadata-" marker and the extension
        String mdFileName = arcFileName.replace("metadata-", "").replace(".arc", "");

        // Read statistics from metadata arc file
        String statistics = readStatInfoFromFile(file, mdFileName);
        String output = mdFileName + "," + statistics + "\n";

        // Write result on output stream
        try {
            os.write(output.getBytes());
            return true;
        } catch (IOException writeFailure) {
            log.warn("File stat info " + file.getName() + ", ...  failed: ", writeFailure);
            return false;
        }
    }

    /**
     * Finishing hook of this batch job. This implementation does nothing; in particular nothing is written to the
     * given stream.
     *
     * @param os the OutputStream to which data is to be written (not used by this implementation)
     */
    public void finish(OutputStream os) {
    }

    /**
     * Gives a short textual summary of the job: the number of files processed and the number of files that failed
     * (0 if no failure list exists).
     *
     * @return the summary text
     */
    @Override
    public String toString() {
        int failedCount = (filesFailed == null) ? 0 : filesFailed.size();
        StringBuilder summary = new StringBuilder();
        summary.append("\nFileList job:\nFiles Processed = ").append(noOfFilesProcessed);
        summary.append("\nFiles  failed = ").append(failedCount);
        return summary.toString();
    }
}