org.archive.io.CrawlerJournal.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.io.CrawlerJournal.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.io;

import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.mg4j.util.MutableString;

import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.lang.StringUtils;
import org.archive.checkpointing.Checkpoint;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.TextUtils;

/**
 * Utility class for a crawler journal/log that is compressed and 
 * rotates by serial number at checkpoints. 
 * 
 * @author gojomo
 */
public class CrawlerJournal implements Closeable {
    private static final Logger LOGGER = Logger.getLogger(CrawlerJournal.class.getName());

    /** prefix for error lines*/
    public static final String LOG_ERROR = "E ";
    /** prefix for timestamp lines */
    public static final String LOG_TIMESTAMP = "T ";

    /**
     * Stream on which we record frontier events.
     */
    protected Writer out = null;

    /** line count */
    protected long lines = 0;
    /** number of lines between timestamps */
    protected int timestamp_interval = 0; // 0 means no timestamps

    /**
     * File we're writing journal to.
     * Keep a reference in case we want to rotate it off.
     */
    protected File gzipFile = null;

    /**
     * Create a new crawler journal at the given location
     * 
     * @param path Directory to make thejournal in.
     * @param filename Name to use for journal file.
     * @throws IOException
     */
    public CrawlerJournal(String path, String filename) throws IOException {
        this.gzipFile = new File(path, filename);
        this.out = initialize(gzipFile);
    }

    /**
     * Create a new crawler journal at the given location
     * 
     * @param file path at which to make journal
     * @throws IOException
     */
    public CrawlerJournal(File file) throws IOException {
        this.gzipFile = file;
        this.out = initialize(gzipFile);
    }

    protected Writer initialize(final File f) throws FileNotFoundException, IOException {
        FileUtils.moveAsideIfExists(f);
        return new OutputStreamWriter(
                new GZIPOutputStream(new FastBufferedOutputStream(new FileOutputStream(f), 32 * 1024)));
    }

    /**
     * Write a line
     * 
     * @param string String
     */
    public synchronized void writeLine(String... strs) {
        try {
            for (String s : strs) {
                this.out.write(s);
            }
            this.out.write("\n");
            noteLine();
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "problem writing journal line: " + StringUtils.join(strs), e);
        }
    }

    /**
     * Write a line. 
     * 
     * @param mstring MutableString to write
     */
    public synchronized void writeLine(MutableString mstring) {
        if (this.out == null) {
            return;
        }
        try {
            mstring.write(out);
            this.out.write("\n");
            noteLine();
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "problem writing journal line: " + mstring, e);
        }
    }

    /**
     * Count and note a line
     * 
     * @throws IOException
     */
    protected void noteLine() throws IOException {
        lines++;
        considerTimestamp();
    }

    /**
     * Write a timestamp line if appropriate
     * 
     * @throws IOException
     */
    protected void considerTimestamp() throws IOException {
        if (timestamp_interval > 0 && lines % timestamp_interval == 0) {
            out.write(LOG_TIMESTAMP);
            out.write(ArchiveUtils.getLog14Date());
            out.write("\n");
        }
    }

    /**
     * Flush and close the underlying IO objects.
     */
    public void close() {
        if (this.out == null) {
            return;
        }
        try {
            this.out.flush();
            this.out.close();
            this.out = null;
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "problem closing journal", e);
        }
    }

    /**
     * Note a serious error vioa a special log line
     * 
     * @param err
     */
    public synchronized void seriousError(String err) {
        writeLine(LOG_ERROR + ArchiveUtils.getLog14Date() + " " + err + "\n");
    }

    /**
     * Handle a checkpoint by rotating the current log to a checkpoint-named
     * file and starting a new log. 
     * 
     * @param checkpointDir
     * @throws IOException
     */
    public synchronized void rotateForCheckpoint(Checkpoint checkpointInProgress) {
        if (this.out == null || !this.gzipFile.exists()) {
            return;
        }
        close();

        File newName = new File(this.gzipFile.getParentFile(),
                this.gzipFile.getName() + "." + checkpointInProgress.getName());
        try {
            FileUtils.moveAsideIfExists(newName);
            if (checkpointInProgress.getForgetAllButLatest()) {
                // merge any earlier checkpointed files into new checkpoint
                // file, taking advantage of the legality of concatenating gzips

                File[] oldCheckpointeds = this.gzipFile.getParentFile().listFiles(new FilenameFilter() {
                    @Override
                    public boolean accept(File dir, String name) {
                        String regex = "^" + Pattern.quote(gzipFile.getName()) + "\\.cp\\d{5}-\\d{14}$";
                        return TextUtils.matches(regex, name);
                    }
                });
                Arrays.sort(oldCheckpointeds);

                for (int i = 1; i < oldCheckpointeds.length; i++) {
                    FileUtils.appendTo(oldCheckpointeds[0], oldCheckpointeds[i]);
                    oldCheckpointeds[i].delete();
                }

                if (oldCheckpointeds.length > 0) {
                    FileUtils.appendTo(oldCheckpointeds[0], this.gzipFile);
                    this.gzipFile.delete();
                    oldCheckpointeds[0].renameTo(newName);
                } else {
                    this.gzipFile.renameTo(newName);
                }
            } else {
                this.gzipFile.renameTo(newName);
            }

            // Open new gzip file.
            this.out = initialize(this.gzipFile);
        } catch (IOException ioe) {
            LOGGER.log(Level.SEVERE, "Problem rotating recovery journal", ioe);
        }
    }
}