org.mycore.frontend.indexbrowser.MCRGoogleSitemapCommon.java Source code

Java tutorial

Introduction

Here is the source code for org.mycore.frontend.indexbrowser.MCRGoogleSitemapCommon.java

Source

/*
 * 
 * $Revision$ $Date$
 *
 * This file is part of ***  M y C o R e  ***
 * See http://www.mycore.de/ for details.
 *
 * This program is free software; you can use it, redistribute it
 * and / or modify it under the terms of the GNU General Public License
 * (GPL) as published by the Free Software Foundation; either version 2
 * of the License or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program, in a file called gpl.txt or license.txt.
 * If not, write to the Free Software Foundation Inc.,
 * 59 Temple Place - Suite 330, Boston, MA  02111-1307 USA
 */

package org.mycore.frontend.indexbrowser;

import java.io.File;
import java.io.IOException;
import java.nio.file.NotDirectoryException;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.GregorianCalendar;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.TimeZone;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.mycore.common.config.MCRConfiguration;
import org.mycore.datamodel.common.MCRObjectIDDate;
import org.mycore.datamodel.common.MCRXMLMetadataManager;
import org.mycore.datamodel.ifs2.MCRObjectIDFileSystemDate;
import org.mycore.datamodel.ifs2.MCRStoredMetadata;
import org.mycore.datamodel.metadata.MCRObjectID;

/**
 * This class implements all common methods to create the sitemap data.
 * <br>
 * used properties:
 * <br>
 * <ul>
 * <li>MCR.baseurl - the application base URL</li>
 * <li>MCR.WebApplication.basedir - the directory where the web application is stored</li>
 * <li>MCR.GoogleSitemap.Directory - the directory where the sitemap should be stored relative to MCR.WebApplication.basedir (it could be empty)</li>
 * <li>MCR.GoogleSitemap.Types - a list of MCRObject types, they should be included</li>
 * <li>MCR.GoogleSitemap.Freq - the frequency of harvesting, 'monthly' is default</li>
 * <li>MCR.GoogleSitemap.Style - a style extension for the URL in form of ?XSL.Style={style}, default is empty</li>
 * <li>MCR.GoogleSitemap.ObjectPath - the path to get the MCRObject in the sitemap URL, 'receive/' is default</li>
 * <li>MCR.GoogleSitemap.NumberOfURLs - the number of URLs in one sitemap file, 10000 is default</li>
 * </ul>
 * 
 * see http://www.sitemaps.org/de/protocol.html
 * 
 * @author Frank Lützenkirchen
 * @author Jens Kupferschmidt
 * @author Thomas Scheffler (yagee)
 * @version $Revision$ $Date$
 * 
 */
public final class MCRGoogleSitemapCommon {

    /** The logger */
    private static final Logger LOGGER = LogManager.getLogger(MCRGoogleSitemapCommon.class.getName());

    /** Locale used for all number and date formatting written to the sitemap. */
    private static final Locale SITEMAP_LOCALE = Locale.ROOT;

    /** Time zone used for all dates written to the sitemap. */
    private static final TimeZone SITEMAP_TIMEZONE = TimeZone.getTimeZone("UTC");

    /** The namespaces */
    private static final Namespace ns = Namespace.getNamespace("http://www.sitemaps.org/schemas/sitemap/0.9");

    private final static String XSI_URL = "http://www.w3.org/2001/XMLSchema-instance";

    private final static Namespace XSI_NAMESPACE = Namespace.getNamespace("xsi", XSI_URL);

    private final static String SITEINDEX_SCHEMA = "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd";

    private final static String SITEMAP_SCHEMA = "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd";

    /** Upper bound for the number of URLs in one sitemap file; also used when the configured value is invalid. */
    private static final int MAX_URLS_PER_SITEMAP = 50000;

    /** The base URL */
    private final String baseurl;

    /** The webapps directory path from configuration */
    private final File webappBaseDir;

    /** The directory path to store sitemaps relative to MCR.WebApplication.basedir */
    private static final String cdir = MCRConfiguration.instance().getString("MCR.GoogleSitemap.Directory", "");

    /** The types to build sitemaps */
    private static final String[] types = MCRConfiguration.instance()
            .getString("MCR.GoogleSitemap.Types", "document").split(",");

    /** The crawl frequency announced to the search engine */
    private static final String freq = MCRConfiguration.instance().getString("MCR.GoogleSitemap.Freq", "monthly");

    /** The style appended to each object URL as ?XSL.Style={style} */
    private static final String style = MCRConfiguration.instance().getString("MCR.GoogleSitemap.Style", "");

    /** The url path for retrieving object metadata */
    private static final String objectPath = MCRConfiguration.instance().getString("MCR.GoogleSitemap.ObjectPath",
            "receive/");

    /** Number of URLs in one sitemap, already clamped to the range 1..{@value #MAX_URLS_PER_SITEMAP} */
    private static final int numberOfURLs = sanitizeNumberOfURLs(
            MCRConfiguration.instance().getInt("MCR.GoogleSitemap.NumberOfURLs", 10000));

    /** The XML table API */
    private static final MCRXMLMetadataManager tm = MCRXMLMetadataManager.instance();

    /**
     * Number format for the part file names. Kept per instance because
     * {@link NumberFormat} is not safe for concurrent use.
     */
    private final NumberFormat number_format = getNumberFormat();

    /**
     * Date formatter for the lastmod elements. Kept per instance because
     * {@link SimpleDateFormat} is not safe for concurrent use.
     */
    private final SimpleDateFormat formatter = getDateFormat();

    /** IDs and modification dates collected by {@link #checkSitemapFile()} */
    private final List<MCRObjectIDDate> objidlist = new ArrayList<>();

    /**
     * Creates a sitemap builder that uses the base URL from the property MCR.baseurl.
     * 
     * @param baseDir
     *            the web application base directory, must be an existing directory
     * @throws NotDirectoryException
     *             if <em>baseDir</em> is not a directory
     */
    public MCRGoogleSitemapCommon(File baseDir) throws NotDirectoryException {
        this(MCRConfiguration.instance().getString("MCR.baseurl", ""), baseDir);
    }

    /**
     * Creates a sitemap builder with an explicit base URL. If a sitemap
     * directory is configured it is created below <em>baseDir</em> when missing.
     * 
     * @param baseURL
     *            the base URL that is prepended to every generated location
     * @param baseDir
     *            the web application base directory, must be an existing directory
     * @throws NotDirectoryException
     *             if <em>baseDir</em> is not a directory
     */
    public MCRGoogleSitemapCommon(String baseURL, File baseDir) throws NotDirectoryException {
        if (!Objects.requireNonNull(baseDir, "baseDir may not be null.").isDirectory()) {
            throw new NotDirectoryException(baseDir.getAbsolutePath());
        }
        this.webappBaseDir = baseDir;
        this.baseurl = baseURL;
        LOGGER.info("Using webappbaseDir: " + baseDir.getAbsolutePath());
        if (!cdir.isEmpty()) {
            File sitemapDirectory = new File(webappBaseDir, cdir);
            // the trailing isDirectory() check tolerates a concurrent creation of the directory
            if (!sitemapDirectory.exists() && !sitemapDirectory.mkdirs() && !sitemapDirectory.isDirectory()) {
                LOGGER.warn("Could not create sitemap directory " + sitemapDirectory.getAbsolutePath());
            }
        }
    }

    /**
     * Maps an invalid configured URL count (smaller than 1 or larger than the
     * maximum) to the maximum, any other value is returned unchanged.
     */
    private static int sanitizeNumberOfURLs(int configured) {
        if (configured < 1 || configured > MAX_URLS_PER_SITEMAP) {
            return MAX_URLS_PER_SITEMAP;
        }
        return configured;
    }

    /**
     * Builds the format for the numeric part of the file names: at least five
     * digits, zero padded, without grouping separators (e.g. 00001).
     */
    private static NumberFormat getNumberFormat() {
        NumberFormat nf = NumberFormat.getIntegerInstance(SITEMAP_LOCALE);
        // integer digits, not fraction digits: fraction digits would produce names like "1.00000"
        nf.setMinimumIntegerDigits(5);
        // no grouping, otherwise values from 1000 on would contain a ',' in the file name
        nf.setGroupingUsed(false);
        return nf;
    }

    /**
     * Builds the yyyy-MM-dd formatter for lastmod values, bound to the sitemap
     * locale and time zone.
     */
    private static SimpleDateFormat getDateFormat() {
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd", SITEMAP_LOCALE);
        df.setTimeZone(SITEMAP_TIMEZONE);
        return df;
    }

    /**
     * The method collects the IDs of all configured object types and computes
     * the number of sitemap part files that are needed for them. The collected
     * list replaces the result of any previous call.
     * 
     * @return the number of parts, i.e. the number of collected objects divided
     *         by <em>numberOfURLs</em>, rounded up; 0 if no object was found
     * @throws IOException
     *             if the metadata store could not be read
     */
    protected final int checkSitemapFile() throws IOException {
        // start from scratch, a repeated call must not duplicate entries
        objidlist.clear();
        for (String configuredType : types) {
            String type = configuredType.trim();
            if (type.isEmpty()) {
                continue;
            }
            for (String id : tm.listIDsOfType(type)) {
                MCRObjectID mcrid = MCRObjectID.getInstance(id);
                MCRStoredMetadata sm = tm.getStore(mcrid).retrieve(mcrid.getNumberAsInteger());
                objidlist.add(new MCRObjectIDFileSystemDate(sm, id));
            }
        }
        int number = objidlist.size() / numberOfURLs;
        if (objidlist.size() % numberOfURLs != 0) {
            number++;
        }
        return number;
    }

    /**
     * The method return the path to the sitemap_google.xml file.
     * 
     * @param number
     *            number of this file - '1' = sitemap_google.xml - '&gt; 1'
     *            sitemap_google_NNNNN.xml where NNNNN is <em>number - 1</em>,
     *            zero padded to five digits
     * @param withpath
     *            true for the full path, false for the file name (prefixed with
     *            the configured sitemap directory, if any)
     * @return a path to sitemap_google.xml
     */
    protected final String getFileName(int number, boolean withpath) {
        String fn = "sitemap_google.xml";
        if (number > 1) {
            fn = "sitemap_google_" + number_format.format(number - 1) + ".xml";
        }
        String local_path = fn;
        if (!cdir.isEmpty()) {
            local_path = cdir + File.separator + fn;
        }
        if (withpath) {
            return webappBaseDir + File.separator + local_path;
        }
        return local_path;
    }

    /**
     * Creates an empty urlset root element with the sitemap namespace and the
     * xsi:schemaLocation of the sitemap schema.
     */
    private static Element newUrlSet() {
        Element urlset = new Element("urlset", ns);
        urlset.addNamespaceDeclaration(XSI_NAMESPACE);
        // the document is namespace qualified, so schemaLocation (namespace/location pair) applies
        urlset.setAttribute("schemaLocation", SITEMAP_SCHEMA, XSI_NAMESPACE);
        return urlset;
    }

    /**
     * The method build the sitemap_google.xml JDOM document over all items
     * collected by {@link #checkSitemapFile()}.
     * 
     * @return The sitemap_google.xml as JDOM document
     */
    protected final Document buildSingleSitemap() throws Exception {
        LOGGER.debug("Build Google URL sitemap_google.xml for whole items.");
        Element urlset = newUrlSet();
        Document jdom = new Document(urlset);
        for (MCRObjectIDDate objectIDDate : objidlist) {
            urlset.addContent(buildURLElement(objectIDDate));
        }
        return jdom;
    }

    /**
     * The method builds the JDOM document for one part of the sitemap from the
     * items collected by {@link #checkSitemapFile()}.
     * 
     * @param number
     *            zero-based index of the part; part 0 contains the first
     *            <em>numberOfURLs</em> items, part 1 the next ones and so on.
     *            The index built by {@link #buildSitemapIndex(int)} refers to
     *            this part as <code>getFileName(number + 2, ...)</code>.
     * @return The sitemap part as JDOM document; it contains no url elements if
     *         the part lies behind the end of the collected items
     */
    protected final Document buildPartSitemap(int number) throws Exception {
        LOGGER.debug("Build Google URL sitemap list number " + number);
        Element urlset = newUrlSet();
        Document jdom = new Document(urlset);

        // half-open range [start, stop) of this part, cut at the end of the list
        int start = numberOfURLs * number;
        int stop = Math.min(numberOfURLs * (number + 1), objidlist.size());
        LOGGER.debug("Build Google URL in range from " + start + " to " + (stop - 1) + ".");
        for (int i = start; i < stop; i++) {
            urlset.addContent(buildURLElement(objidlist.get(i)));
        }
        return jdom;
    }

    /**
     * Builds one url element with loc, lastmod and changefreq children for the
     * given object.
     * 
     * @param objectIDDate
     *            the ID and the last modification date of the object
     * @return the url element in the sitemap namespace
     */
    private Element buildURLElement(MCRObjectIDDate objectIDDate) {
        StringBuilder location = new StringBuilder(1024);
        location.append(baseurl).append(objectPath).append(objectIDDate.getId());
        if ((style != null) && (style.trim().length() > 0)) {
            location.append("?XSL.Style=").append(style);
        }
        Element url = new Element("url", ns);
        url.addContent(new Element("loc", ns).addContent(location.toString()));
        String datestr = formatter.format(objectIDDate.getLastModified());
        url.addContent(new Element("lastmod", ns).addContent(datestr));
        url.addContent(new Element("changefreq", ns).addContent(freq));
        return url;
    }

    /**
     * The method build the index sitemap_google.xml JDOM document. Each entry
     * gets the current date as lastmod value.
     * 
     * @param number
     *            number of indexed part files
     * @return The index sitemap_google.xml as JDOM document
     */
    protected final Document buildSitemapIndex(int number) {
        LOGGER.debug("Build Google sitemap number " + number);
        Element index = new Element("sitemapindex", ns);
        index.addNamespaceDeclaration(XSI_NAMESPACE);
        index.setAttribute("schemaLocation", SITEINDEX_SCHEMA, XSI_NAMESPACE);
        Document jdom = new Document(index);
        // the same "now" for all entries
        String datestr = formatter.format(new GregorianCalendar(SITEMAP_TIMEZONE, SITEMAP_LOCALE).getTime())
                .trim();
        for (int i = 0; i < number; i++) {
            Element sitemap = new Element("sitemap", ns);
            index.addContent(sitemap);
            // getFileName() builds a file system path; a URL always needs '/' as separator
            String relativeUrl = getFileName(i + 2, false).replace(File.separatorChar, '/');
            String location = (baseurl + relativeUrl).trim();
            sitemap.addContent(new Element("loc", ns).addContent(location));
            sitemap.addContent(new Element("lastmod", ns).addContent(datestr));
        }
        return jdom;
    }

    /**
     * This method remove all sitemap files (all files whose name starts with
     * sitemap_google) from the sitemap directory. A file that could not be
     * deleted is reported in the log.
     */
    protected final void removeSitemapFiles() {
        File dir = new File(webappBaseDir, cdir);
        File[] li = dir.listFiles();
        if (li == null) {
            // not a directory or not readable, nothing to remove
            return;
        }
        for (File fi : li) {
            if (fi.getName().startsWith("sitemap_google")) {
                LOGGER.debug("Remove file " + fi.getName());
                if (!fi.delete()) {
                    LOGGER.warn("Could not remove file " + fi.getAbsolutePath());
                }
            }
        }
    }
}