hivemall.docs.FuncsListGenerator.java Source code

Introduction

Here is the source code for hivemall.docs.FuncsListGenerator.java, a Maven plugin Mojo that generates the function-list pages of the Hivemall user guide (funcs.md and generic_funcs.md) from the @Description annotations on Hivemall's UDF classes.
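
The generator scans the classpath for classes annotated with Hive's org.apache.hadoop.hive.ql.exec.Description annotation and turns each one into a Markdown list entry. As a rough, hypothetical sketch of the input it consumes (the UDF class and its wording below are made up; only the annotation attributes mirror what the generator actually reads), a documented function might look like this:

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;

// Hypothetical UDF, for illustration only. The generator substitutes the
// function name for the _FUNC_ placeholder in value() and renders extended()
// as an indented code block (as SQL if it contains a SELECT statement).
@Description(name = "example_add",
        value = "_FUNC_(int x, int y) - Returns x + y",
        extended = "SELECT example_add(1, 2);\n 3")
public final class ExampleAddUDF extends UDF {
    public int evaluate(int x, int y) {
        return x + y;
    }
}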

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package hivemall.docs;

import static hivemall.docs.utils.MarkdownUtils.asCodeBlock;
import static hivemall.docs.utils.MarkdownUtils.asInlineCode;
import static hivemall.docs.utils.MarkdownUtils.asListElement;
import static hivemall.docs.utils.MarkdownUtils.indent;
import static org.apache.commons.lang.StringEscapeUtils.escapeHtml;

import hivemall.utils.lang.StringUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.annotation.Nonnull;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.maven.execution.MavenSession;
import org.apache.maven.plugin.AbstractMojo;
import org.apache.maven.plugin.MojoExecutionException;
import org.apache.maven.plugins.annotations.Mojo;
import org.apache.maven.plugins.annotations.Parameter;
import org.reflections.Reflections;

/**
 * Generate a list of UDFs for documentation.
 *
 * @see <a href="https://hivemall.incubator.apache.org/userguide/misc/generic_funcs.html">generic_funcs.html</a>
 * @see <a href="https://hivemall.incubator.apache.org/userguide/misc/funcs.html">funcs.html</a>
 */
@Mojo(name = "generate-funcs-list")
public class FuncsListGenerator extends AbstractMojo {

    @Parameter(defaultValue = "${basedir}", readonly = true)
    private File basedir;

    @Parameter(defaultValue = "${session}", readonly = true)
    private MavenSession session;

    @Parameter(defaultValue = "docs/gitbook/misc/generic_funcs.md")
    private String pathToGenericFuncs;

    @Parameter(defaultValue = "docs/gitbook/misc/funcs.md")
    private String pathToFuncs;

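    // Section headers for generic_funcs.md, each mapped to the packages whose
    // functions are listed under that header.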
    private static final Map<String, List<String>> genericFuncsHeaders = new LinkedHashMap<>();
    static {
        genericFuncsHeaders.put("# Array", Arrays.asList("hivemall.tools.array", "hivemall.tools.list"));
        genericFuncsHeaders.put("# Map", Collections.singletonList("hivemall.tools.map"));
        genericFuncsHeaders.put("# Bitset", Collections.singletonList("hivemall.tools.bits"));
        genericFuncsHeaders.put("# Compression", Collections.singletonList("hivemall.tools.compress"));
        genericFuncsHeaders.put("# MapReduce", Collections.singletonList("hivemall.tools.mapred"));
        genericFuncsHeaders.put("# Math", Collections.singletonList("hivemall.tools.math"));
        genericFuncsHeaders.put("# Matrix", Collections.singletonList("hivemall.tools.matrix"));
        genericFuncsHeaders.put("# Text processing", Collections.singletonList("hivemall.tools.text"));
        genericFuncsHeaders.put("# Others", Collections.singletonList("hivemall.tools"));
    }

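    // Section headers for funcs.md, each mapped to the packages whose
    // functions are listed under that header.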
    private static final Map<String, List<String>> funcsHeaders = new LinkedHashMap<>();
    static {
        funcsHeaders.put("# Regression", Collections.singletonList("hivemall.regression"));
        funcsHeaders.put("# Classification", Collections.<String>emptyList());
        funcsHeaders.put("## Binary classification", Collections.singletonList("hivemall.classifier"));
        funcsHeaders.put("## Multiclass classification",
                Collections.singletonList("hivemall.classifier.multiclass"));
        funcsHeaders.put("# Matrix factorization", Collections.singletonList("hivemall.mf"));
        funcsHeaders.put("# Factorization machines", Collections.singletonList("hivemall.fm"));
        funcsHeaders.put("# Recommendation", Collections.singletonList("hivemall.recommend"));
        funcsHeaders.put("# Anomaly detection", Collections.singletonList("hivemall.anomaly"));
        funcsHeaders.put("# Topic modeling", Collections.singletonList("hivemall.topicmodel"));
        funcsHeaders.put("# Preprocessing", Collections.singletonList("hivemall.ftvec"));
        funcsHeaders.put("## Data amplification", Collections.singletonList("hivemall.ftvec.amplify"));
        funcsHeaders.put("## Feature binning", Collections.singletonList("hivemall.ftvec.binning"));
        funcsHeaders.put("## Feature format conversion", Collections.singletonList("hivemall.ftvec.conv"));
        funcsHeaders.put("## Feature hashing", Collections.singletonList("hivemall.ftvec.hashing"));
        funcsHeaders.put("## Feature paring", Collections.singletonList("hivemall.ftvec.pairing"));
        funcsHeaders.put("## Ranking", Collections.singletonList("hivemall.ftvec.ranking"));
        funcsHeaders.put("## Feature scaling", Collections.singletonList("hivemall.ftvec.scaling"));
        funcsHeaders.put("## Feature selection", Collections.singletonList("hivemall.ftvec.selection"));
        funcsHeaders.put("## Feature transformation and vectorization",
                Collections.singletonList("hivemall.ftvec.trans"));
        funcsHeaders.put("# Geospatial functions", Collections.singletonList("hivemall.geospatial"));
        funcsHeaders.put("# Distance measures", Collections.singletonList("hivemall.knn.distance"));
        funcsHeaders.put("# Locality-sensitive hashing", Collections.singletonList("hivemall.knn.lsh"));
        funcsHeaders.put("# Similarity measures", Collections.singletonList("hivemall.knn.similarity"));
        funcsHeaders.put("# Evaluation", Collections.singletonList("hivemall.evaluation"));
        funcsHeaders.put("# Sketching", Collections.singletonList("hivemall.sketch.hll"));
        funcsHeaders.put("# Ensemble learning", Collections.singletonList("hivemall.ensemble"));
        funcsHeaders.put("## Bagging", Collections.singletonList("hivemall.ensemble.bagging"));
        funcsHeaders.put("# Decision trees and RandomForest", Arrays.asList("hivemall.smile.classification",
                "hivemall.smile.regression", "hivemall.smile.tools"));
        funcsHeaders.put("# XGBoost", Arrays.asList("hivemall.xgboost.classification",
                "hivemall.xgboost.regression", "hivemall.xgboost.tools"));
        funcsHeaders.put("# Others", Arrays.asList("hivemall", "hivemall.dataset", "hivemall.ftvec.text"));
    }

    @Override
    public void execute() throws MojoExecutionException {
        if (!isReactorRootProject()) {
            // output only once across the projects
            return;
        }

        generate(new File(basedir, pathToGenericFuncs),
                "This page describes a list of useful Hivemall generic functions. See also a [list of machine-learning-related functions](./funcs.md).",
                genericFuncsHeaders);
        generate(new File(basedir, pathToFuncs),
                "This page describes a list of Hivemall functions. See also a [list of generic Hivemall functions](./generic_funcs.md) for more general-purpose functions such as array and map UDFs.",
                funcsHeaders);
    }

    private boolean isReactorRootProject() {
        return session.getExecutionRootDirectory().equalsIgnoreCase(basedir.toString());
    }

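    /**
     * Scans Hivemall classes annotated with {@link Description}, renders each annotation as a
     * Markdown list entry grouped by package, and writes the entries to the output file under
     * the given section headers.
     */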
    private void generate(@Nonnull File outputFile, @Nonnull String preface,
            @Nonnull Map<String, List<String>> headers) throws MojoExecutionException {
        Reflections reflections = new Reflections("hivemall");
        Set<Class<?>> annotatedClasses = reflections.getTypesAnnotatedWith(Description.class);

        StringBuilder sb = new StringBuilder();
        Map<String, Set<String>> packages = new HashMap<>();

        Pattern func = Pattern.compile("_FUNC_(\\(.*?\\))(.*)", Pattern.DOTALL);

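        // Build a Markdown entry (and optional example block) for every non-deprecated
        // @Description-annotated class, grouped by package name.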
        for (Class<?> annotatedClass : annotatedClasses) {
            Deprecated deprecated = annotatedClass.getAnnotation(Deprecated.class);
            if (deprecated != null) {
                continue;
            }

            Description description = annotatedClass.getAnnotation(Description.class);

            String value = description.value().replaceAll("\n", " ");
            Matcher matcher = func.matcher(value);
            if (matcher.find()) {
                value = asInlineCode(description.name() + matcher.group(1)) + escapeHtml(matcher.group(2));
            }
            sb.append(asListElement(value));

            StringBuilder sbExtended = new StringBuilder();
            if (!description.extended().isEmpty()) {
                sbExtended.append(description.extended());
                sb.append("\n");
            }

            String extended = sbExtended.toString();
            if (extended.isEmpty()) {
                sb.append("\n");
            } else {
                if (extended.toLowerCase().contains("select")) { // extended description contains SQL statements
                    sb.append(indent(asCodeBlock(extended, "sql")));
                } else {
                    sb.append(indent(asCodeBlock(extended)));
                }
            }

            String packageName = annotatedClass.getPackage().getName();
            if (!packages.containsKey(packageName)) {
                packages.put(packageName, new TreeSet<String>());
            }
            Set<String> descriptions = packages.get(packageName);
            descriptions.add(sb.toString());

            StringUtils.clear(sb);
        }

        try (PrintWriter writer = new PrintWriter(outputFile)) {
            // license header
            writer.println("<!--");
            try {
                File licenseFile = new File(basedir, "resources/license-header.txt");

                try (BufferedReader bufferedReader =
                        new BufferedReader(new FileReader(licenseFile))) {
                    String line;
                    while ((line = bufferedReader.readLine()) != null) {
                        writer.println(indent(line));
                    }
                }
            } catch (IOException e) {
                throw new MojoExecutionException("Failed to read license file");
            }
            writer.println("-->\n");

            writer.println(preface);

            writer.println("\n<!-- toc -->\n");

            for (Map.Entry<String, List<String>> e : headers.entrySet()) {
                writer.println(e.getKey() + "\n");
                List<String> packageNames = e.getValue();
                for (String packageName : packageNames) {
                    for (String desc : packages.get(packageName)) {
                        writer.println(desc);
                    }
                }
            }

            writer.flush();
        } catch (FileNotFoundException e) {
            throw new MojoExecutionException("Output file is not found");
        }
    }
}
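
For reference, the _FUNC_ rewrite performed in generate() can be exercised on its own. The following standalone sketch is not part of the plugin; the sample value is made up, and the backtick rendering only roughly mirrors what asInlineCode and escapeHtml are assumed to produce. It shows what the two regex groups capture:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class FuncPlaceholderDemo {
    public static void main(String[] args) {
        // Same pattern as in FuncsListGenerator: group(1) captures the argument
        // list following _FUNC_, group(2) the rest of the description.
        Pattern func = Pattern.compile("_FUNC_(\\(.*?\\))(.*)", Pattern.DOTALL);
        String value = "_FUNC_(int x, int y) - Returns x + y";
        Matcher matcher = func.matcher(value);
        if (matcher.find()) {
            // Roughly what asInlineCode(name + group(1)) + escapeHtml(group(2)) yields
            String rendered = "`example_add" + matcher.group(1) + "`" + matcher.group(2);
            System.out.println(rendered);
            // prints: `example_add(int x, int y)` - Returns x + y
        }
    }
}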