Java tutorial
/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.core.benchmarks.memtime;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryPoolMXBean;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IProcessingComponent;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.util.CloseableUtils;
import org.junit.BeforeClass;
import org.simpleframework.xml.ElementList;
import org.simpleframework.xml.Root;
import org.simpleframework.xml.core.Persister;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Maps;

/**
 * Compute approximate memory and time characteristics for a given algorithm and input.
 */
@SuppressWarnings("unused")
public class MemTimeBenchmark
{
    /**
     * Due to class renames (LanguageCode changed its package) the logged XML files don't
     * deserialize properly. This is a stub wrapper that only reads documents.
     */
    @Root(name = "searchresult", strict = false)
    public static class ResponseWrapper
    {
        @ElementList(inline = true, required = false)
        public List<Document> documents;
    }

    /**
     * A list of input documents. These documents should be some real-life input snippets
     * (preferably short, because longer input is simulated by concatenating documents
     * together).
     */
    protected static ArrayList<Document> documents = new ArrayList<Document>();

    /**
     * Directory with input XML files. Files should be organized using the file name
     * convention:
     *
     * <pre>
     * response-xxxxx.xml
     * </pre>
     *
     * where <code>xxxxx</code> is a sequential number starting from 0.
     */
    private static File inputFilesDir;

    /**
     * Folder for the output log files.
     */
    private static File outputFilesDir;

    /**
     * Maximum number of input files to read from disk.
     */
    private static int MAX_FILES;

    /**
     * Minimum number of documents to cluster.
     */
    protected static int MIN;

    /**
     * Maximum number of documents to cluster.
     */
    protected static int MAX;

    /**
     * Increment step for the documents-to-cluster range.
     */
    protected static int STEP;

    /**
     * The controller used to drive the clustering process.
     */
    private static Controller controller;
    /**
     * Override defaults with system properties.
     */
    public static void overrideDefaults()
    {
        inputFilesDir = new File(System.getProperty("inputFilesDir", "input"));
        outputFilesDir = new File(System.getProperty("outputFilesDir", "tmp"));
        MAX_FILES = Integer.parseInt(System.getProperty("MAX_FILES", "200"));
        MIN = Integer.parseInt(System.getProperty("MIN", "100"));
        MAX = Integer.parseInt(System.getProperty("MAX", "20000"));
        STEP = Integer.parseInt(System.getProperty("STEP", "100"));
    }

    /**
     * Populate {@link #documents}.
     */
    public static void readData() throws Exception
    {
        Persister p = new Persister();
        for (int i = 0; i < MAX_FILES; i++)
        {
            String fileName = String.format("response-%05d.xml", i);
            ResponseWrapper w = p.read(ResponseWrapper.class,
                new File(inputFilesDir, fileName));

            if (w.documents == null) continue;
            for (Document d : w.documents)
            {
                documents.add(d);
            }
        }
    }

    /**
     * Dump JVM info to the output folder.
     */
    private static void dumpJVMInfo() throws Exception
    {
        String[] properties =
        {
            "java.runtime.name", "java.vm.version", "java.vm.vendor", "java.vm.name",
            "java.vm.specification.name", "java.runtime.version", "os.arch",
            "java.vm.specification.vendor", "os.name", "java.specification.name",
            "sun.management.compiler", "os.version", "java.specification.version",
            "java.vm.specification.version", "sun.arch.data.model",
            "java.specification.vendor", "java.vm.info", "java.version", "java.vendor",
            "sun.cpu.isalist",
        };
        Arrays.sort(properties);

        File output = new File(outputFilesDir, "jvm.log");
        Writer w = null;
        try
        {
            w = new OutputStreamWriter(new FileOutputStream(output), "UTF-8");
            w.write("Benchmark executed at: " + new Date() + "\n\n");
            for (String prop : properties)
            {
                w.write(prop + "=" + System.getProperty(prop, "n/a") + "\n");
            }
            w.write("processors=" + Runtime.getRuntime().availableProcessors() + "\n");
            w.write("\n");
        }
        finally
        {
            CloseableUtils.close(w);
        }
    }

    /**
     * Initialize static data.
     */
    @BeforeClass
    public static void initStaticData() throws Exception
    {
        overrideDefaults();
        readData();
        dumpJVMInfo();
        controller = ControllerFactory.createPooling();
    }
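    /*
     * Measurement protocol: for each document count in the [MIN, MAX] range
     * (incremented by STEP), evalShortDocs() below forces garbage collection,
     * resets the per-pool peak usage counters, clusters a prefix of the input
     * documents, and then records the wall-clock time and the summed peak
     * usage of all memory pools.
     */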
    /**
     * Perform the time/memory evaluation for a single algorithm.
     */
    protected void evalShortDocs(String resultPrefix,
        Class<? extends IProcessingComponent> algorithm, int MIN, int MAX, int STEP)
    {
        final Logger logger = LoggerFactory.getLogger(resultPrefix);

        File output = new File(outputFilesDir, resultPrefix + ".log");
        Writer w = null;
        int docs = 0;
        try
        {
            w = new OutputStreamWriter(new FileOutputStream(output), "UTF-8");
            String header = "docs size[MB] time[s] mem[MB]";
            w.write(header + "\n");
            logger.info(header);

            for (docs = MIN; docs < Math.min(MAX + 1, documents.size()); docs += STEP)
            {
                memClean();
                memPeak();

                final long start = now();
                final HashMap<String, Object> attributes = Maps.newHashMap();
                final List<Document> inputList =
                    documents.subList(0, Math.min(docs, documents.size()));
                attributes.put(AttributeNames.DOCUMENTS, inputList);
                // luceneIndex(inputList);
                controller.process(attributes, algorithm);
                final long end = now();

                final double memUsedMB = memPeak() / (1024 * 1024.0);
                final double timeSecs = (end - start) / 1000.0;
                final double mbLength = countByteLength(inputList) / (1024 * 1024.0);
                final int docsCount = inputList.size();

                final String logLine = String.format(Locale.ENGLISH,
                    "%d %.2f %.2f %.2f", docsCount, mbLength, timeSecs, memUsedMB);
                logger.info(logLine);
                w.write(logLine + "\n");
                w.flush();
            }
        }
        catch (OutOfMemoryError e)
        {
            logger.warn("OOM at: " + docs);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
        finally
        {
            CloseableUtils.close(w);
        }
    }

    /**
     * Index documents in-memory using Lucene.
     */
    private void luceneIndex(List<Document> inputList)
    {
        try
        {
            Directory dir = new RAMDirectory();
            IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
            IndexWriter w = new IndexWriter(dir, config);
            for (Document d : inputList)
            {
                final org.apache.lucene.document.Document nd =
                    new org.apache.lucene.document.Document();
                nd.add(new TextField("title",
                    StringUtils.defaultIfEmpty(d.getTitle(), ""), Store.NO));
                nd.add(new TextField("snippet",
                    StringUtils.defaultIfEmpty(d.getSummary(), ""), Store.NO));
                w.addDocument(nd);
            }
            w.close();
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }

    /**
     * Count the overall length of the input (titles and snippets). The length is
     * expressed in characters.
     */
    private static long countByteLength(List<Document> inputList)
    {
        long length = 0;
        for (Document d : inputList)
        {
            if (d.getTitle() != null) length += d.getTitle().length();
            if (d.getSummary() != null) length += d.getSummary().length();
        }
        return length;
    }

    /**
     * @return {@link System#currentTimeMillis()}.
     */
    private static long now()
    {
        return System.currentTimeMillis();
    }

    /**
     * Best-effort attempt to force {@link System#gc()}.
     */
    private static void memClean()
    {
        System.gc();
        System.gc();
        Thread.yield();
    }

    /**
     * Return the peak number of bytes used (all memory pools) and reset the peak usage.
     */
    private static long memPeak()
    {
        long peak = 0;
        for (MemoryPoolMXBean b : ManagementFactory.getMemoryPoolMXBeans())
        {
            peak += b.getPeakUsage().getUsed();
            b.resetPeakUsage();
        }
        return peak;
    }
}
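The class above is only a harness; concrete benchmarks subclass it and invoke evalShortDocs() with a specific clustering algorithm. Below is a minimal sketch of such a subclass, assuming org.carrot2.clustering.lingo.LingoClusteringAlgorithm is on the classpath; the subclass name and the "lingo" log prefix are illustrative and not part of the file above.

package org.carrot2.core.benchmarks.memtime;

import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.junit.Test;

public class LingoMemTimeBenchmark extends MemTimeBenchmark
{
    // Cluster growing prefixes of the input documents with Lingo; one result
    // line per document count is written to <outputFilesDir>/lingo.log.
    @Test
    public void lingo()
    {
        evalShortDocs("lingo", LingoClusteringAlgorithm.class, MIN, MAX, STEP);
    }
}

Because initStaticData() reads its configuration from system properties, the input range and directories can be changed at launch time without recompiling, for example: -DinputFilesDir=input -DoutputFilesDir=tmp -DMIN=100 -DMAX=5000 -DSTEP=100.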