ch.ethz.epics.export.GoogleImageSitemap.java Source code

Introduction

Here is the source code for ch.ethz.epics.export.GoogleImageSitemap.java
Source

package ch.ethz.epics.export;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;

import ch.ethz.epics.integration.cumulus.CumulusTools;

/**
 * Builds Google image sitemap XML: one {@code <url>} fragment per e-pics record
 * ({@link #getImageSitemapEntry}) plus the surrounding {@code <urlset>} header and footer
 * ({@link #writeHeader}, {@link #writeFooter}).
 *
 * <p>The fragments are returned without namespace declarations because the header written by
 * {@link #writeHeader} already declares them on the enclosing {@code <urlset>} element.
 */
public class GoogleImageSitemap {

    // Google image sitemap namespaces
    private final static String NS_SITEMAP = "http://www.google.com/schemas/sitemap/0.9";
    private final static String NS_IMAGESITEMAP = "http://www.google.com/schemas/sitemap-images/1.0";

    /** W3C DATETIME pattern. The trailing literal 'Z' means the value must be formatted in UTC. */
    private final static String W3C_DATETIME_PATTERN = "yyyy-MM-dd'T'HH:mm:ss'Z'";

    // e-pics field ids, named after the way this class uses them
    private final static String FIELD_ID_RECORD_MODIFIED = "{af4b2e02-5f6a-11d2-8f20-0000c0e166dc}";
    private final static String FIELD_ID_TITLE = "{af4b2e3d-5f6a-11d2-8f20-0000c0e166dc}";
    private final static String FIELD_ID_CAPTION = "{af4b2e34-5f6a-11d2-8f20-0000c0e166dc}";
    private final static String FIELD_ID_DATE_CREATED = "{af4b2e51-5f6a-11d2-8f20-0000c0e166dc}";
    private final static String FIELD_ID_DATE = "{132267c2-4148-4b76-b851-88409d7d2799}";

    /** Default (unprefixed) sitemap namespace. */
    public final static Namespace nsSitemap = Namespace.getNamespace("", NS_SITEMAP);

    /** Image sitemap namespace, bound to the {@code image} prefix. */
    public final static Namespace nsImageSitemap = Namespace.getNamespace("image", NS_IMAGESITEMAP);

    /**
     * Evaluates an XPath expression (with the e-pics namespace registered) against the document.
     *
     * @param doc      document to search
     * @param strXPath XPath expression
     * @return the first selected node cast to {@link Element}, or {@code null} if nothing matches
     * @throws JDOMException if the expression cannot be compiled or evaluated
     */
    protected static Element getSingleNode(Document doc, String strXPath) throws JDOMException {
        XPath xpath = XPath.newInstance(strXPath);
        xpath.addNamespace(XPort.nsEpics);

        List<?> matches = xpath.selectNodes(doc);
        return matches.isEmpty() ? null : (Element) matches.get(0);
    }

    /**
     * Builds the {@code <url>} sitemap fragment for a single record.
     *
     * <p>Optional source fields (modification date, title, caption, location, image size) that are
     * missing from the record are left out of the fragment instead of causing a failure.
     *
     * @param xport    supplies the {@code index.baseUrl} parameter used to build the URLs
     * @param ct       supplies the date format used to parse the record's date fields
     * @param log      receives debug and warning output
     * @param globalId id used to build the image URL and the landing page URL
     * @param doc      XML document describing the record
     * @param lang     value written to {@code <image:language>}
     * @return the pretty-printed {@code <url>} element with its namespace declarations removed
     * @throws JDOMException if an XPath lookup on {@code doc} fails
     * @throws IOException   if serializing the fragment fails
     */
    public static String getImageSitemapEntry(XPort xport, CumulusTools ct, Log log, String globalId, Document doc,
            String lang) throws JDOMException, IOException {

        Element recordNode = getSingleNode(doc, "//epics:record");
        String recName = (recordNode != null) ? recordNode.getAttributeValue("recordName", XPort.nsEpics) : null;
        log.debug("- image sitemap building for internal id='" + recName + "'");

        String baseUrl = (String) xport.getParam("index.baseUrl");

        // urlset sitemap root; only its single <url> child is serialized below
        Document sitemap = new Document();
        Element urlset = new Element("urlset", nsSitemap);
        urlset.addNamespaceDeclaration(nsImageSitemap);
        sitemap.setRootElement(urlset);

        Element url = new Element("url", nsSitemap);
        urlset.addContent(url);

        // <loc>: URL of the original image on the site
        addTextChild(url, "loc", nsSitemap, baseUrl + "images/" + globalId + ".jpg");

        // <priority>: importance of this image relative to other images on the same site
        addTextChild(url, "priority", nsSitemap, "0.5");

        // <lastmod>: e-pics record modification date; omitted when it is missing or unparseable
        String lastModified = resolveLastModified(ct, log, globalId, doc);
        if (lastModified != null) {
            addTextChild(url, "lastmod", nsSitemap, lastModified);
        }

        // <changefreq>: how frequently the image is likely to change
        addTextChild(url, "changefreq", nsSitemap, "monthly");

        // <expires>: e-pics adds 2 years to now
        Calendar expiry = Calendar.getInstance();
        expiry.add(Calendar.YEAR, 2);
        addTextChild(url, "expires", nsSitemap, formatW3cDateTime(expiry.getTime()));

        // image:image
        Element imageTag = new Element("image", nsImageSitemap);
        url.addContent(imageTag);

        // image:landing_page
        Element landingPage = new Element("landing_page", nsImageSitemap);
        imageTag.addContent(landingPage);

        // <image:loc>: where the user is directed when clicking the image in the search results
        addTextChild(landingPage, "loc", nsImageSitemap, baseUrl + globalId + ".html");

        // <image:language>: language of the landing page (e-pics: primarily german content)
        addTextChild(landingPage, "language", nsImageSitemap, lang);

        // <image:title> / <image:caption>: at most one each per landing page
        addOptionalFieldText(landingPage, "title", doc, FIELD_ID_TITLE);
        addOptionalFieldText(landingPage, "caption", doc, FIELD_ID_CAPTION);

        // Categories feed both the keyword list and the geo location.
        Set<String> keywords = new HashSet<String>();
        String locationName = "";
        XPath categoryPath = XPath.newInstance("//epics:category");
        categoryPath.addNamespace(XPort.nsEpics);
        for (Object node : categoryPath.selectNodes(doc)) {
            String catName = ((Element) node).getTextNormalize();
            log.debug("catName: " + catName);

            // KJ/20081203: use all keywords; categories themselves are not written
            collectKeywords(catName, keywords, log);

            // E-Pics ETHBIB.Bildarchiv *spezial*: the last matching category wins
            if (catName.contains("Politische Geographie")) {
                locationName = extractLocation(catName, log);
            }
        }

        // <image:keyword>: one element per keyword
        for (String keyword : keywords) {
            addTextChild(landingPage, "keyword", nsImageSitemap, keyword);
        }

        // <image:family_friendly>: acceptable values are yes and no
        // TODO derive from record data instead of hard-coding
        addTextChild(imageTag, "family_friendly", nsImageSitemap, "yes");

        // <image:geo_location>: only written when a location was actually found
        if (locationName.length() > 0) {
            addTextChild(imageTag, "geo_location", nsImageSitemap, locationName);
        }

        // <image:license> - E-Pics: Creative Commons Namensnennung, Non-commercial, no derivatives
        addTextChild(imageTag, "license", nsImageSitemap, "http://creativecommons.org/licenses/by-nc-nd/2.5/ch/");

        // <image:quality> - E-Pics: not used

        // <image:publication_date>
        // KJ: 20090619 - Google does not like publication dates < 1970, therefore the element is
        // not added to the sitemap. The date is still resolved so that unparseable values keep
        // showing up in the log.
        resolvePublicationDate(ct, log, globalId, doc);

        // <image:size>: E-Pics uses the size of the derivative image
        Element mediumImage = getSingleNode(doc, "//epics:field[@epics:fieldName='Image']");
        String finalSize = (mediumImage != null) ? mediumImage.getAttributeValue("size", XPort.nsEpics) : null;
        if (finalSize != null) {
            addTextChild(imageTag, "size", nsImageSitemap, finalSize);
        }

        // <image:watermarking> - E-Pics: maximum of 5% for creative commons license
        Element watermarking = addTextChild(imageTag, "watermarking", nsImageSitemap, "yes");
        watermarking.setAttribute("percentage", "5");

        // write XML fragment into string
        StringWriter sw = new StringWriter();
        Format fragmentFormat = Format.getPrettyFormat().setOmitDeclaration(true);
        fragmentFormat.setEncoding("UTF-8");
        new XMLOutputter(fragmentFormat).output(url, sw);

        // The namespaces are declared once on <urlset> by writeHeader(), so drop them here.
        String part = sw.toString();
        part = part.replace(" xmlns=\"" + NS_SITEMAP + "\"", "");
        part = part.replace(" xmlns:image=\"" + NS_IMAGESITEMAP + "\"", "");

        return part;
    }

    /** Creates a child element with the given text, appends it to {@code parent} and returns it. */
    private static Element addTextChild(Element parent, String name, Namespace ns, String text) {
        Element child = new Element(name, ns);
        child.setText(text);
        parent.addContent(child);
        return child;
    }

    /** Looks up the {@code epics:field} element with the given {@code epics:id}; null if absent. */
    private static Element getFieldById(Document doc, String fieldId) throws JDOMException {
        return getSingleNode(doc, "//epics:field[@epics:id='" + fieldId + "']");
    }

    /** Appends an image-namespace child holding the field's content, if the field has any. */
    private static void addOptionalFieldText(Element parent, String name, Document doc, String fieldId)
            throws JDOMException {
        Element field = getFieldById(doc, fieldId);
        if (field == null) {
            return;
        }
        String text = field.getChildText("content", XPort.nsEpics);
        if (text != null && text.length() > 0) {
            addTextChild(parent, name, nsImageSitemap, text);
        }
    }

    /** Formats a date as a W3C DATETIME string in UTC, matching the literal 'Z' suffix. */
    private static String formatW3cDateTime(Date date) {
        // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
        SimpleDateFormat w3c = new SimpleDateFormat(W3C_DATETIME_PATTERN);
        w3c.setTimeZone(java.util.TimeZone.getTimeZone("UTC"));
        return w3c.format(date);
    }

    /** Returns the record modification date as W3C DATETIME, or null (after a warning) if unusable. */
    private static String resolveLastModified(CumulusTools ct, Log log, String globalId, Document doc)
            throws JDOMException {
        Element recMod = getFieldById(doc, FIELD_ID_RECORD_MODIFIED);
        String raw = (recMod != null) ? recMod.getChildTextNormalize("content", XPort.nsEpics) : null;
        if (raw == null || raw.length() == 0) {
            log.warn("can't parse record modification date for " + globalId);
            return null;
        }
        try {
            return formatW3cDateTime(ct.getDefaultDateFormat().parse(raw));
        } catch (ParseException e) {
            // cannot parse date: log warning, leave away lastmod
            log.warn("can't parse record modification date for " + globalId, e);
            return null;
        }
    }

    /**
     * Returns the publication date as W3C DATETIME, or null if the record has none or it cannot be
     * parsed. The "date created" field is preferred; otherwise the plain date field is used, with
     * year-only and mm/YYYY values expanded to a full date first.
     */
    private static String resolvePublicationDate(CumulusTools ct, Log log, String globalId, Document doc)
            throws JDOMException {
        String raw = null;
        Element created = getFieldById(doc, FIELD_ID_DATE_CREATED);
        if (created != null) {
            raw = created.getChildTextNormalize("content", XPort.nsEpics);
        } else {
            Element dateField = getFieldById(doc, FIELD_ID_DATE);
            if (dateField != null) {
                raw = dateField.getChildTextNormalize("content", XPort.nsEpics);
                if (raw != null && raw.length() > 0) {
                    raw = expandPartialDate(raw);
                    log.debug("converted dateonly value = " + raw);
                }
            }
        }
        if (raw == null || raw.length() == 0) {
            return null;
        }
        try {
            return formatW3cDateTime(ct.getDefaultDateFormat().parse(raw));
        } catch (ParseException e) {
            log.warn("can't parse publication date for " + globalId, e);
            return null;
        }
    }

    /** Expands "YYYY" to "01.01.YYYY" and "mm/YYYY" to "01.mm.YYYY"; other values pass through. */
    private static String expandPartialDate(String value) {
        boolean hasSlash = value.contains("/");
        if (!hasSlash && !value.contains(".")) {
            // year only
            return "01.01." + value;
        }
        if (hasSlash) {
            // mm/YYYY
            String[] parts = value.split("/");
            if (parts.length == 2) {
                return "01." + parts[0] + "." + parts[1];
            }
        }
        return value;
    }

    /** Splits a category path on '>' and adds every trimmed part longer than one character. */
    private static void collectKeywords(String catName, Set<String> keywords, Log log) {
        for (String part : catName.split(">")) {
            String keyword = part.trim();
            if (keyword.length() > 1 && !keywords.contains(keyword)) {
                log.debug("got keyword: " + keyword);
                keywords.add(keyword);
            }
        }
    }

    /** Reduces a "Politische Geographie" category path to a comma-separated location string. */
    private static String extractLocation(String catName, Log log) {
        log.debug("found location = " + catName);

        String reduced = catName.replaceAll(", Kanton > ", ", ");
        reduced = reduced.replaceAll(", Stadt > ", ", ");

        // keep only what follows the "Politische Geographie" level
        String location = reduced.replaceAll(".*?Politische Geographie\\s*>*(.*)", "$1");
        log.debug("reduced location to = " + location);

        location = location.replaceAll("\\s*>", ",").trim();
        log.debug("reduced location to = " + location);
        return location;
    }

    /**
     * Writes the XML declaration and the opening {@code <urlset>} tag of a google image sitemap.
     *
     * @param out writer receiving the header; it is neither flushed nor closed
     * @throws Exception if writing fails
     */
    public static void writeHeader(OutputStreamWriter out) throws Exception {
        String header = "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n";
        header += "<urlset xmlns=\"" + NS_SITEMAP + "\"\n";
        header += "        xmlns:image=\"" + NS_IMAGESITEMAP + "\">\n";

        out.write(header);
    }

    /**
     * Writes the closing {@code </urlset>} tag and flushes the writer. The writer is not closed.
     *
     * @param out writer receiving the footer
     * @throws Exception if writing or flushing fails
     */
    public static void writeFooter(OutputStreamWriter out) throws Exception {
        out.write("\n</urlset>");
        out.flush();
    }

}