/*
 * Copyright (C) 2009 Atlas of Living Australia
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 */
package org.ala.harvester;

import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.ala.documentmapper.DocumentMapper;
import org.ala.documentmapper.FlickrDocumentMapper;
import org.ala.model.Licence;
import org.ala.repository.ParsedDocument;
import org.ala.repository.Repository;
import org.ala.repository.Triple;
import org.ala.util.DOMUtils;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.lang.time.DateUtils;
import org.apache.log4j.Logger;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.Scope;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.NodeList;

/**
 * A Harvester class for Flickr.
 *
 * @author Dave Martin
 */
public class FlickrHarvester implements Harvester {

    /** log4j logger for this harvester. */
    protected Logger logger = Logger.getLogger(FlickrHarvester.class);

    /** Value of the "endpoint" connection parameter; not read by the code visible in this class. */
    protected String endpoint;
    /**
     * Value of the "eolGroupId" connection parameter. The search request currently
     * sends it as the <code>user_id</code> query parameter.
     */
    private String eolGroupId;
    /** NOTE(review): never assigned or read in the visible code - confirm whether it is still needed. */
    private String userId;
    /** Prefix of every Flickr REST URL built by this class ("/?method=..." is appended to it). */
    private String flickrRestBaseUrl;
    /** Sent as the <code>api_key</code> query parameter on every request. */
    private String flickrApiKey;
    /** Sent as the <code>per_page</code> query parameter of the search request. */
    private int recordsPerPage;
    /** Mapper supplied through {@link #setDocumentMapper(DocumentMapper)}. */
    protected DocumentMapper documentMapper;
    /** Repository used to look up already-harvested documents and to store new ones. */
    protected Repository repository;
    /** Set by <code>start(int, int)</code>; not read elsewhere in the visible code. */
    protected int timeGap = 0;

    /**
     * Main method for testing this particular Harvester.
     *
     * @param args command line arguments
     */
    public static void main(String[] args) throws Exception {
        String[] locations = { "classpath*:spring.xml" };
        ApplicationContext context = new ClassPathXmlApplicationContext(locations);
        FlickrHarvester h = new FlickrHarvester();
        Repository r = (Repository) context.getBean("repository");
        h.setDocumentMapper(new FlickrDocumentMapper());

        //set the connection params   
        Map<String, String> connectParams = new HashMap<String, String>();
        //      connectParams.put("eolGroupId", "806927@N20");
        connectParams.put("eolGroupId", "22545712@N05");
        connectParams.put("flickrRestBaseUrl", "");
        connectParams.put("flickrApiKey", "08f5318120189e9d12669465c0113351");
        connectParams.put("recordsPerPage", "50");

        h.start(1106); //1013 is the ID for the data source flickr

    /**
     * @see org.ala.harvester.Harvester#setConnectionParams(java.util.Map)
     */
    public void setConnectionParams(Map<String, String> connectionParams) {
        this.endpoint = connectionParams.get("endpoint");
        this.eolGroupId = connectionParams.get("eolGroupId");
        this.flickrRestBaseUrl = connectionParams.get("flickrRestBaseUrl");
        this.flickrApiKey = connectionParams.get("flickrApiKey");
        this.recordsPerPage = Integer.parseInt(connectionParams.get("recordsPerPage"));

    public void start(int infosourceId, int timeGap) throws Exception {
        this.timeGap = timeGap;

    /**
     * @see org.ala.harvester.Harvester#start(int)
     */
    public void start(int infosourceId) throws Exception {

        //get licences maps
        Map<String, Licence> licences = getLicencesMap();

        int totalIndexed = 0;
        Date endDate = new Date();
        Date finalStartDate = DateUtils.parseDate("2004-01-01", new String[] { "yyyy-MM-dd" });
        if (System.getProperty("startDate") != null) {
            endDate = DateUtils.parseDate(System.getProperty("startDate"), new String[] { "yyyy-MM-dd" });
        if (System.getProperty("endDate") != null) {
            finalStartDate = DateUtils.parseDate(System.getProperty("endDate"), new String[] { "yyyy-MM-dd" });

        Date startDate = DateUtils.addDays(endDate, -1);
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");

        // page through the images month-by-month
        while (startDate.after(finalStartDate)) {
  "Harvesting time period: " + df.format(startDate) + " to " + df.format(endDate));
            totalIndexed += indexTimePeriod(infosourceId, endDate, startDate);
            endDate = startDate;
            startDate = DateUtils.addDays(endDate, -1);
        }"Total harvested: " + totalIndexed);

    public int indexTimePeriod(int infosourceId, Date endDate, Date startDate) {
        int currentPageNum = 1; // index starts from 1
        int totalPages = -1;
        int imagesIndexed = 0;
        // TODO Auto-generated method stub
        while (totalPages == -1 || totalPages >= currentPageNum) {

            // Obtains the image listing on the page number specified.
            // Instance variable `currentResDom` will have new
            // DOM representation of the result.
            try {
                Document parsedDoc = getIndexPage(currentPageNum, startDate, endDate);

                if (isDocExtractionSuccessful(parsedDoc)) {

                    // returns {currentPageNum, totalPages, actualRecordsPerPage}
                    int[] counter = parseDataFragmentationInfo(parsedDoc);
                    totalPages = counter[1];
                    int photosInPage = counter[2];
          "Photos in result set: " + photosInPage);
                    // Process for each photo found in current index.
                    // In XPath array of elements starts with 1, as opposed to 0
                    for (int tempCount = 1; tempCount <= photosInPage; tempCount++) {
                        // catch error here and continue.
                        try {
                            boolean success = processSingleImage(infosourceId, tempCount, parsedDoc);
                            if (success)
                        } catch (Exception err) {
                                    "!!ERROR encountered in processing image @ " + "page: " + currentPageNum + " "
                                            + "photo: " + tempCount + " Skipping to next image. " + err.toString(),
                    } // End of looping through images in index page.

                } else {
                    logger.error("Extracting page number " + currentPageNum
                            + " returned error.  Skipping to next page number.");


            } catch (Exception e) {
                logger.error("Extracting page number " + currentPageNum
                        + " returned error.  Skipping to next page number.", e);
        } // End of infinite loop of parsing index page, then parsing individual images.
        return imagesIndexed;

    /**
     * Retrieves a map of licences.
     *
     * @return the licences keyed by their id
     */
    private Map<String, Licence> getLicencesMap() throws Exception {

        final String flickrMethodUri = "";
        String urlToSearch = this.flickrRestBaseUrl + "/" + "?method=" + flickrMethodUri + "&api_key="
                + this.flickrApiKey;;
        logger.debug("URL to search is: " + "`" + urlToSearch + "`" + "\n");

        // Create an instance of HttpClient.
        HttpClient client = new HttpClient();

        // Create a method instance.
        GetMethod method = new GetMethod(urlToSearch);

        // Provide custom retry handler is necessary
        method.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, "UTF-8");
        method.getParams().setParameter(HttpMethodParams.HTTP_ELEMENT_CHARSET, "UTF-8");
        method.getParams().setParameter(HttpMethodParams.HTTP_URI_CHARSET, "UTF-8");

        try {
            int statusCode = client.executeMethod(method);

            if (statusCode != HttpStatus.SC_OK) {
                String errMsg = "HTTP GET to " + "`" + urlToSearch + "`"
                        + " returned non HTTP OK code.  Returned code " + statusCode + " and message "
                        + method.getStatusLine() + "\n";
                throw new Exception(errMsg);

            //parse the response
            InputStream responseStream = method.getResponseBodyAsStream();
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document doc = builder.parse(responseStream);
            XPathFactory xfactory = XPathFactory.newInstance();
            XPath xpath = xfactory.newXPath();

            XPathExpression xe = xpath.compile("/rsp/licenses/license");
            NodeList nodeSet = (NodeList) xe.evaluate(doc, XPathConstants.NODESET);

            Map<String, Licence> licencesMap = new HashMap<String, Licence>();

            for (int i = 0; i < nodeSet.getLength(); i++) {
                NamedNodeMap map = nodeSet.item(i).getAttributes();
                String id = map.getNamedItem("id").getNodeValue();
                Licence licence = new Licence();
                licencesMap.put(id, licence);
            return licencesMap;

        } catch (Exception httpErr) {
            String errMsg = "HTTP GET to `" + urlToSearch + "` returned HTTP error.";
            throw new Exception(errMsg, httpErr);
        } finally {
            // Release the connection.

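    /**
     * Process a single image, do the document mapping etc.
     *
     * @param infosourceId identifier of the infosource the document is stored under
     * @param imageIndex 1-based position of the photo within the listing page
     * @param currentResDom DOM of the listing page that contains the photo
     * @throws Exception if the photo information cannot be fetched or parsed
     */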
    private boolean processSingleImage(int infosourceId, int imageIndex, Document currentResDom) throws Exception {

        XPathFactory factory = XPathFactory.newInstance();
        XPath xpath = factory.newXPath();
        String xPathToPhotoId = "/rsp/photos/photo[" + imageIndex + "]/@id";
        String photoId = (String) xpath.evaluate(xPathToPhotoId, currentResDom, XPathConstants.STRING);"Handling photo ID: " + photoId);

        final String flickrMethod = "";

        // Calls the Flickr's Photo Info API to determine whether the photo
        // comes from Australia or not.
        String photoInfoFlickrUrl = this.flickrRestBaseUrl + "/" + "?method=" + flickrMethod + "&" + "api_key="
                + this.flickrApiKey + "&" + "photo_id=" + photoId;

        System.out.println("PHOTO URL:" + photoInfoFlickrUrl);

        org.w3c.dom.Document photoInfoDom = null;

        // Create an instance of HttpClient.
        HttpClient client = new HttpClient();
        // Create a method instance.
        GetMethod method = new GetMethod(photoInfoFlickrUrl);

        // Provide custom retry handler is necessary
        method.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, "UTF-8");
        method.getParams().setParameter(HttpMethodParams.HTTP_ELEMENT_CHARSET, "UTF-8");
        method.getParams().setParameter(HttpMethodParams.HTTP_URI_CHARSET, "UTF-8");

                "Fetching info. for photo with ID " + photoId + " from " + "`" + photoInfoFlickrUrl + "`" + "\n");

        try {
            int statusCode = client.executeMethod(method);

            if (statusCode != HttpStatus.SC_OK) {
                String errMsg = "HTTP GET to " + "`" + photoInfoFlickrUrl + "`" + " returned non HTTP OK code.  "
                        + "Returned code " + statusCode + " and message " + method.getStatusLine() + "\n";
                throw new Exception(errMsg);

            InputStream responseStream = method.getResponseBodyAsStream();

            // Instantiates a DOM builder to create a DOM of the response.
            DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder domBuilder = domFactory.newDocumentBuilder();

            photoInfoDom = domBuilder.parse(responseStream);

        } catch (Exception domCreationErr) {
            throw new Exception("Failed to create DOM representation of GET response.", domCreationErr);

        } finally {
            // Release the connection.

        // Check for Flickr's error.
        if (!isDocExtractionSuccessful(photoInfoDom)) {
            throw new Exception("Flickr error response for fetching single image information.");

        if (System.getProperty("overwrite") != null && "false".equals(System.getProperty("overwrite"))) {
            String photoPageUrl = (String) xpath.evaluate("/rsp/photo/urls/url[@type=\"photopage\"]/text()",
                    photoInfoDom, XPathConstants.STRING);

            logger.debug("photo page URL: " + photoPageUrl);
            org.ala.model.Document doc = this.repository.getDocumentByGuid(photoPageUrl);
            if (doc != null) {
                logger.debug("Document with URI already harvested. Skipping: " + photoPageUrl);
                return true;

        // Determines whether photo has geo-coded tag from Australia.
        // If so, pass onto DocumentMapper.
        if (isPhotoFromAustralia(photoInfoDom)) {

            try {
                String document = (DOMUtils.domToString(photoInfoDom));
                // FIXME flickr GUID ???
                List<ParsedDocument> parsedDocs ="", document.getBytes());
                for (ParsedDocument parsedDoc : parsedDocs) {
                    this.repository.storeDocument(infosourceId, parsedDoc);
                return false;
            } catch (Exception docMapperErr) {
                // Skipping over errors here and proceeding to next document.
                logger.error("Problem processing image. " + docMapperErr.toString() + ", Problem processing: "
                        + photoInfoFlickrUrl, docMapperErr);
        } else {
            logger.debug("Photo is unAustralian: " + photoInfoFlickrUrl);

        return false;
    } // End of `processSingleImage` method.

    /**
     * Determines whether a Flickr photo appears to come from Australia. <br />
     * The geo-coded country is read with the XPath
     * <code>/rsp/photo/location/country/text()</code>; the photo's title,
     * description and tags are also searched for the word "australia". <br />
     * Non case-sensitive String comparison is performed.
     *
     * @param photoInfoXmlDom
     *            DOM representation of the XML result of Flickr's photo
     *            information call.
     * @return <code>true</code> if the photo is geo-coded to Australia or
     *         mentions Australia in its title, description or tags,
     *         <code>false</code> otherwise.
     * @throws Exception
     *             On error.
     * @since v0.4
     */
    private boolean isPhotoFromAustralia(org.w3c.dom.Document photoInfoXmlDom) throws Exception {

        XPathFactory factory = XPathFactory.newInstance();
        XPath xpath = factory.newXPath();

        if (photoInfoXmlDom == null) {
            String errMsg = "DOM of Photo Info. XML has null reference.";
            throw new Exception(errMsg);

        String photoTitle = (String) xpath.evaluate("/rsp/photo/title/text()", photoInfoXmlDom,
        String photoDescription = (String) xpath.evaluate("/rsp/photo/description/text()", photoInfoXmlDom,
        String photoCountry = (String) xpath.evaluate("/rsp/photo/location/country/text()", photoInfoXmlDom,

        //check the machine tags
        String xPathToTags = "/rsp/photo/tags/tag/text()";
        NodeList nl = (NodeList) xpath.evaluate(xPathToTags, photoInfoXmlDom, XPathConstants.NODESET);
        for (int i = 0; i < nl.getLength(); i++) {
            String content = nl.item(i).getNodeValue();
            if (content != null) {
                content = content.toLowerCase();
                if (content.contains("australia")) {
                    return true;

        if ("australia".compareToIgnoreCase(photoCountry) == 0
                || (photoTitle != null && photoTitle.toLowerCase().contains("australia"))
                || (photoDescription != null && photoDescription.toLowerCase().contains("australia"))) {
            return true;

        return false;
    } // End of `isPhotoFromAustralia` method.

    /**
     * Parses the XML listing of images to obtain data necessary for future data
     * extraction. Specifically, the current page number, current images per
     * page and total number of pages.
     *
     * @since v0.4
     */
    private int[] parseDataFragmentationInfo(Document currentResDom) throws Exception {

        XPathFactory factory = XPathFactory.newInstance();
        XPath xpath = factory.newXPath();

        try {
            int currentPageNum = Integer
                    .parseInt((String) xpath.evaluate("/rsp/photos/@page", currentResDom, XPathConstants.STRING));
            int totalPages = Integer
                    .parseInt((String) xpath.evaluate("/rsp/photos/@pages", currentResDom, XPathConstants.STRING));
            int actualRecordsPerPage = Integer.parseInt(
                    (String) xpath.evaluate("count(/rsp/photos/photo)", currentResDom, XPathConstants.STRING));

            logger.debug("Extracted and set current page number to " + currentPageNum);
            logger.debug("Extracted and set total page number to " + totalPages);
            logger.debug("Actual number of records returned is " + actualRecordsPerPage);

            return new int[] { currentPageNum, totalPages, actualRecordsPerPage };

        } catch (XPathExpressionException getPageFragmentationError) {
            String errMsg = "Failed to obtain data fragmentation information from Flickr's REST response.";
            throw new Exception(errMsg, getPageFragmentationError);

    } // End of `parseHarvestFragementData` method.

    private boolean isDocExtractionSuccessful(org.w3c.dom.Document resDom) throws Exception {

        if (resDom == null) {
            return false;

        XPathFactory factory = XPathFactory.newInstance();
        XPath xpath = factory.newXPath();

        // <rsp stat="fail">
        // <err code="[error-code]" msg="[error-message]" />
        // </rsp>

        String xPathToStatus = "/rsp/@stat";

        String statusString = null;
        try {
            statusString = (String) xpath.evaluate(xPathToStatus, resDom, XPathConstants.STRING);
        } catch (XPathExpressionException getStatusStringErr) {
            String errMsg = "Failed to obtain Flickr REST response's status string.";
            throw new Exception(errMsg, getStatusStringErr);
        logger.debug("Response status: " + statusString);
        if ("ok".equals(statusString)) {
            return true;
        } else {
            logger.error("Error response status: " + statusString);

        // Status is false.
        String flickrErrCode = null;
        String flickrErrMsg = null;
        try {
            flickrErrCode = (String) xpath.evaluate("/rsp/err/@code", resDom, XPathConstants.STRING);
            flickrErrMsg = (String) xpath.evaluate("/rsp/err/@msg", resDom, XPathConstants.STRING);
        } catch (XPathExpressionException getErrDetailsErr) {
            String errMsg = "Failed to obtain Flickr REST response's error code and message.";
            throw new Exception(errMsg, getErrDetailsErr);

        String errMsg = "Flickr REST response returned error.  Code: " + flickrErrCode + " " + "Message: " + "`"
                + flickrErrMsg + "`" + "\n";

        return false;
    } // End of `isDocExtractionSuccessful` successful.

    private org.w3c.dom.Document getIndexPage(int pageNumber, Date startDate, Date endDate) throws Exception {

        final String flickrMethodUri = "";

        // Constructs the GET URL to search.

        // `woe_id` is Yahoo! Where On Earth ID.
        // Issue
        // to find Australia.
        // `woe_id` here is country level code, as opposed to continent code.

        SimpleDateFormat mysqlDateTime = new SimpleDateFormat("yyyy-MM-dd");

        String minUpdateDate = mysqlDateTime.format(startDate);
        String maxUpdateDate = mysqlDateTime.format(endDate);

        String urlToSearch = this.flickrRestBaseUrl + "/" + "?method=" + flickrMethodUri + "&content_type=1"
        //      + "&sort=date-posted-asc"
        //      + "&machine_tag_mode=any" 
        //      + "&group_id=" + this.eolGroupId
                + "&user_id=" + this.eolGroupId
                //      + "&accuracy=3" 
                + "&privacy_filter=1"
                //      + "&machine_tags=%22geo:country=Australia%22"
                //      + "&machine_tags=%22taxonomy:binomial=Pogona%20barbata%22"
                //      + "&tags=geo:country=Australia&country=Australia"
                //      + "&has_geo=1" 
                //      + "&accuracy=3" 
                //      + "&woe_id=23424748"
                // MYSQL date time
                + "&min_upload_date=" + minUpdateDate //startDate
                + "&max_upload_date=" + maxUpdateDate //endDate
                + "&api_key=" + this.flickrApiKey + "&page=" + pageNumber + "&per_page=" + this.recordsPerPage;

        //      String urlToSearch = 
        //      "" +
        //      "&api_key=08f5318120189e9d12669465c0113351" +
        //      "&page=1" +
        //      "&per_page=50" +
        //      "&machine_tag_mode=any" +
        //      "&content_type=1" +
        //      "&group_id=806927@N20&privacy_filter=1" +
        //      "&machine_tags=%22taxonomy:binomial=Pogona%20barbata%22";"Search URL: " + urlToSearch);

         * // Default parameters if not supplied. if (this.flickrApiKeySupplied
         * == false) { urlToSearch += "&" + "api_key=" + this.flickrApiKey; } if
         * (this.currentPageNumSupplied == false) { urlToSearch += "&" + "page="
         * + this.currentPageNum; } if (this.recordsPerPageSupplied == false) {
         * urlToSearch += "&" + "per_page=" + this.recordsPerPage; }
        logger.debug("URL to search is: " + "`" + urlToSearch + "`" + "\n");

        // Create an instance of HttpClient.
        HttpClient client = new HttpClient();

        // Create a method instance.
        GetMethod method = new GetMethod(urlToSearch);

        // Provide custom retry handler is necessary
        method.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, "UTF-8");
        method.getParams().setParameter(HttpMethodParams.HTTP_ELEMENT_CHARSET, "UTF-8");
        method.getParams().setParameter(HttpMethodParams.HTTP_URI_CHARSET, "UTF-8");

        try {
            int statusCode = client.executeMethod(method);

            if (statusCode != HttpStatus.SC_OK) {
                String errMsg = "HTTP GET to " + "`" + urlToSearch + "`"
                        + " returned non HTTP OK code.  Returned code " + statusCode + " and message "
                        + method.getStatusLine() + "\n";
                throw new Exception(errMsg);

            InputStream responseStream = method.getResponseBodyAsStream();

            // Instantiates a DOM builder to create a DOM of the response.
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = factory.newDocumentBuilder();

            // return a parsed Document
            return builder.parse(responseStream);

        } catch (Exception httpErr) {
            String errMsg = "HTTP GET to `" + urlToSearch + "` returned HTTP error.";
            throw new Exception(errMsg, httpErr);
        } finally {
            // Release the connection.
    } // End of `getIndexPage` method.

    public void debugParsedDoc(ParsedDocument parsedDoc) {


        logger.debug("GUID: " + parsedDoc.getGuid());
        logger.debug("Content-Type: " + parsedDoc.getContentType());

        Map<String, String> dublinCore = parsedDoc.getDublinCore();
        for (String key : dublinCore.keySet()) {
            logger.debug("DC: " + key + "\t" + dublinCore.get(key));

        List<Triple<String, String, String>> triples = parsedDoc.getTriples();
        for (Triple<String, String, String> triple : triples) {
            logger.debug("RDF: " + triple.getSubject() + "\t" + triple.getPredicate() + "\t" + triple.getObject());


    /**
     * @see
     * org.ala.harvester.Harvester#setDocumentMapper(org.ala.documentmapper.
     * DocumentMapper)
     */
    public void setDocumentMapper(DocumentMapper documentMapper) {
        this.documentMapper = documentMapper;

    /**
     * @see
     * org.ala.harvester.Harvester#setRepository(org.ala.repository.Repository)
     */
    public void setRepository(Repository repository) {
        this.repository = repository;
