org.rdswitchboard.harvesters.pmh.Harvester.java Source code

Introduction

Here is the source code for org.rdswitchboard.harvesters.pmh.Harvester.java
Source

package org.rdswitchboard.harvesters.pmh;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import jdk.nashorn.internal.runtime.regexp.joni.Warnings;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.joda.time.DateTime;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.auth.InstanceProfileCredentialsProvider;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.util.StringUtils;

/**   
 * This file is part of RD-Switchboard.
 *
 * RD-Switchboard is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * OAI:PMH Harvester Library
 * 
 * This Library is designed to harvest any OAI:PMH repository and store the data locally.
 * The data will be stored as a collection of .xml files. The files will be stored as:
 * <p>
 * {@code
 *   <Base Folder>/<Metadata Name>/<Set Name>/<Record Identifier>.xml
 * }
 * <p>
 * The library will also create an index for every set. The index file will be opened and 
 * closed automatically. The index will be used to track record timestamp and updated record
 * file if needed. The set index will be stored as:
 * <p>
 * {@code 
 *   <Base Folder>/<Metadata Name>/<Set Name>.xml
 * }
 * <p>  
 * The library will cope outdated records in the cache before overwriting them. The cached files will 
 * be stored as:
 * <p>
 * {@code 
 *   <Base Folder>/<Metadata Name>/_cache/<Set Name>/<Record Identifier>_<Record Timestamp>.xml
 * }
 * <p>
 * While working, the library will also create temporary status file. The file could be used to track 
 * library work status. It also will be used to resume harvesting process, if it was terminated by
 * any reason. If such behavior is not needed, the status file must be deleted before calling the
 * harvest function. The status file will be stored as: 
 * <p>
 * {@code 
 *    <Base Folder>/<Metadata Name>/status.xml
 * }
 * <p>
 * To use the library, simple construct the Harvester object and then call the harvest function with 
 * desired medadata prefix. The harvest function can be called several times with different metadata
 * prefixes to harvest different metadata. Example:
 * <p><pre>
 * {@code 
 *    try {
 *      // CERN repository
 *       String repoUri = "http://cds.cern.ch/oai2d.py/";
 *
 *      // Base folder to store XML data
 *      String folderXml = "cern/xml";
 *  
 *      // Construct Harvester object.
 *      Harvester harvester = new Harvester(repoUri, folderXml);
 *         
 *      // List and display supported meatadata formats
 *       List<MetadataFormat> formats = harvester.listMetadataFormats();
 *       System.out.println("Supported metadata formats:");
 *       for (MetadataFormat format : formats) {
 *           System.out.println(format.toString());
 *       }
 *         
 *       // harvest the metdata with first prefix on the list
 *      harvester.harvest(formats.get(0).getMetadataPrefix));
 *   } catch (Exception e) {
 *       e.printStackTrace();
 *   }
 * }</pre>
 * 
 * @author Dmitrij Kudriavcev, dmitrij@kudriavcev.info
 * @version 1.1.1
 */
public class Harvester {

    private static final String URL_IDENTIFY = "?verb=Identify";
    private static final String URL_LIST_METADATA_FORMATS = "?verb=ListMetadataFormats";
    private static final String URL_LIST_SETS = "?verb=ListSets";
    private static final String URL_LIST_RECORDS = "?verb=ListRecords&set=%s&metadataPrefix=%s";
    private static final String URL_LIST_DEFAULT_RECORDS = "?verb=ListRecords&metadataPrefix=%s";
    private static final String URL_LIST_RECORDS_RESUMPTION_TOKEN = "?verb=ListRecords&resumptionToken=%s";
    private static final String URL_LIST_SETS_RESUMPTION_TOKEN = "?verb=ListSets&resumptionToken=%s";

    private static final String ERR_NO_RECORDS_MATCH = "noRecordsMatch";

    private static XPathExpression XPATH_OAI_PMH;
    private static XPathExpression XPATH_ERROR;
    private static XPathExpression XPATH_REPOSITORY_NAME;
    private static XPathExpression XPATH_PROTOCOL_VERSION;
    private static XPathExpression XPATH_EARLEST_TIMESHTAMP;
    private static XPathExpression XPATH_DELETED_RECORD;
    private static XPathExpression XPATH_GRANULARITY;
    private static XPathExpression XPATH_ADMIN_EMAIL;
    private static XPathExpression XPATH_LIST_SETS;
    private static XPathExpression XPATH_SET_NAME;
    private static XPathExpression XPATH_SET_SPEC;
    private static XPathExpression XPATH_RECORDS_RESUMPTION_TOKEN;
    private static XPathExpression XPATH_SETS_RESUMPTION_TOKEN;

    private static String harvestDate;

    static {
        XPath xPath = XPathFactory.newInstance().newXPath();
        try {
            XPATH_REPOSITORY_NAME = xPath.compile("/OAI-PMH/Identify/repositoryName/text()");
            XPATH_PROTOCOL_VERSION = xPath.compile("/OAI-PMH/Identify/protocolVersion/text()");
            XPATH_EARLEST_TIMESHTAMP = xPath.compile("/OAI-PMH/Identify/earliestTimestamp/text()");
            XPATH_DELETED_RECORD = xPath.compile("/OAI-PMH/Identify/deletedRecord/text()");
            XPATH_GRANULARITY = xPath.compile("/OAI-PMH/Identify/granularity/text()");
            XPATH_ADMIN_EMAIL = xPath.compile("/OAI-PMH/Identify/adminEmail/text()");

            XPATH_LIST_SETS = xPath.compile("/OAI-PMH/ListSets/set");
            XPATH_SET_NAME = xPath.compile("./setName/text()");
            XPATH_SET_SPEC = xPath.compile("./setSpec/text()");

            XPATH_OAI_PMH = xPath.compile("/OAI-PMH");
            XPATH_ERROR = xPath.compile("./error");
            XPATH_RECORDS_RESUMPTION_TOKEN = xPath.compile("./ListRecords/resumptionToken");
            XPATH_SETS_RESUMPTION_TOKEN = xPath.compile("/OAI-PMH/ListSets/resumptionToken");

            harvestDate = new SimpleDateFormat("yyyy-MM-dd").format(DateTime.now().toDate());

        } catch (XPathExpressionException e) {
            e.printStackTrace();
        }
    }

    /**
     * variable to store repo URL. Can not be null.
     */
    private String repoUrl;

    private String bucketName;
    private String folderName;

    private String repoPrefix;

    private String metadataPrefix;

    private final Map<String, SetStatus> processedSets = new HashMap<String, SetStatus>();

    private DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();

    private String repositoryName;
    private String protocolVersion;
    private String earliestTimestamp;
    private String deletedRecord;
    private String granularity;
    private String adminEmail;

    private Set<String> blackList;
    private Set<String> whiteList;

    private boolean failOnError;
    private int maxAttempts;
    private int attemptDelay;
    private int connectionTimeout;
    private int readTimeout;

    private AmazonS3 s3client;

    /**
     * Harvester constructor
     * 
     * @param repoUrl : The Repository URL
     * @param folderBase : The address of the folder, there received data must be saved.
     * @throws JAXBException 
     * @throws IOException 
     * @throws Exception 
     */
    public Harvester(final Properties properties) throws Exception {
        repoUrl = properties.getProperty("url");
        if (StringUtils.isNullOrEmpty(repoUrl))
            throw new IllegalArgumentException("The OAI:PMH Repository URL can not be empty");

        repoPrefix = properties.getProperty("name");

        if (StringUtils.isNullOrEmpty(repoPrefix))
            throw new IllegalArgumentException("The OAI:PMH Repository Prefix can not be empty");

        metadataPrefix = properties.getProperty("metadata");

        String accessKey = properties.getProperty("aws.access.key");
        String secretKey = properties.getProperty("aws.secret.key");
        if (StringUtils.isNullOrEmpty(accessKey) || StringUtils.isNullOrEmpty(secretKey))
            s3client = new AmazonS3Client(new InstanceProfileCredentialsProvider());
        else
            s3client = new AmazonS3Client(new BasicAWSCredentials(accessKey, secretKey));

        bucketName = properties.getProperty("s3.bucket");
        folderName = properties.getProperty("folder");

        if (StringUtils.isNullOrEmpty(bucketName) && StringUtils.isNullOrEmpty(folderName))
            throw new IllegalArgumentException(
                    "Please enter either local folder name or AWS S3 Bucket name to store the harvested files");
        if (!StringUtils.isNullOrEmpty(bucketName) && !StringUtils.isNullOrEmpty(folderName))
            throw new IllegalArgumentException(
                    "S3 bucket and local folder parameters can not be used at the same time. Please disable one in the configuration file.");

        try {
            File fileBlackList = new File(properties.getProperty("black.list"));
            if (fileBlackList.isFile()) {
                List<String> list = FileUtils.readLines(fileBlackList);
                blackList = new HashSet<String>();
                for (String l : list) {
                    String s = l.trim();
                    if (!s.isEmpty())
                        blackList.add(s);
                }
            }
        } catch (Exception e) {
            blackList = null;
        }

        try {
            File fileWhiteList = new File(properties.getProperty("white.list"));
            if (fileWhiteList.isFile()) {
                List<String> list = FileUtils.readLines(fileWhiteList);
                whiteList = new HashSet<String>();
                for (String l : list) {
                    String s = l.trim();
                    if (!s.isEmpty())
                        whiteList.add(s);
                }
            }
        } catch (Exception e) {
            whiteList = null;
        }

        if (null != blackList && !blackList.isEmpty() && null != whiteList && !whiteList.isEmpty())
            throw new Exception(
                    "The black and the withe list parameters can not be set at the same time. Please disable one in the configuration file.");

        connectionTimeout = Integer.parseInt(properties.getProperty("conn.timeout", "0"));
        readTimeout = Integer.parseInt(properties.getProperty("read.timeout", "0"));
        maxAttempts = Integer.parseInt(properties.getProperty("max.attempts", "0"));
        attemptDelay = Integer.parseInt(properties.getProperty("attempt.delay", "0"));
        failOnError = Boolean.parseBoolean(properties.getProperty("fail.on.error", "true"));

    }

    public void setWhiteList(Set<String> set) {
        whiteList = new HashSet<String>();
        whiteList.addAll(set);
    }

    /**
     * Return the repository name (available after calling identify() function).
     * @return String - Repository name
     */
    public String getRepositoryName() {
        return repositoryName;
    }

    /**
     * Return protocol version (available after calling identify() function).
     * @return String - protocol version
     */
    public String getProtocolVersion() {
        return protocolVersion;
    }

    /**
     * Return the earliest timestamp in the repository (available after calling identify() function).
     * @return String - earliest timestamp
     */
    public String getEarliestTimestamp() {
        return earliestTimestamp;
    }

    /**
     * Return deleted record behavior (available after calling identify() function).
     * @return String - deleted record behavior
     */
    public String getDeletedRecordBehavior() {
        return deletedRecord;
    }

    /**
     * Return granularity template (available after calling identify() function).
     * @return String - granularity template
     */
    public String getGranularityTemplate() {
        return granularity;
    }

    /**
     * Return admin email of repositiry (available after calling identify() function).
     * @return String - admin email
     */
    public String getAdminEmail() {
        return adminEmail;
    }

    public String getMetadataPrefix() {
        return metadataPrefix;
    }

    /**
     * Function to indentify on OAI:PMH Server. Could be used to test connection with the server.
     * Also will initialize all server information variables.
     * @return true if connection could be established and server didn't return any error.
     */
    public boolean identify() {
        String url = repoUrl + URL_IDENTIFY;

        try {
            Document doc = dbf.newDocumentBuilder().parse(url);

            repositoryName = (String) XPATH_REPOSITORY_NAME.evaluate(doc, XPathConstants.STRING);
            protocolVersion = (String) XPATH_PROTOCOL_VERSION.evaluate(doc, XPathConstants.STRING);
            earliestTimestamp = (String) XPATH_EARLEST_TIMESHTAMP.evaluate(doc, XPathConstants.STRING);
            deletedRecord = (String) XPATH_DELETED_RECORD.evaluate(doc, XPathConstants.STRING);
            granularity = (String) XPATH_GRANULARITY.evaluate(doc, XPathConstants.STRING);
            adminEmail = (String) XPATH_ADMIN_EMAIL.evaluate(doc, XPathConstants.STRING);

            return true;
        } catch (Exception e) {
            e.printStackTrace();
        }

        return false;
    }

    /**
     * Function to list supported metadata formats
     * @return {@code List<MetadataFormat>} - list of metadata formats
     */
    public List<MetadataFormat> listMetadataFormats() {
        String url = repoUrl + URL_LIST_METADATA_FORMATS;

        try {
            Document doc = dbf.newDocumentBuilder().parse(url);

            return MetadataFormat.getMetadataFormats(doc);
        } catch (Exception e) {
            e.printStackTrace();
        }

        return null;
    }

    /**
     * Function to list sets
     * @return Map<String, String> where Key is set name and Value is set specification
     */
    public Map<String, String> listSets() {
        String url = null;
        String tokenString = null;
        try {
            Map<String, String> mapSets = new HashMap<String, String>();

            do {
                if (tokenString == null) {
                    url = repoUrl + URL_LIST_SETS;
                } else {
                    url = repoUrl + String.format(URL_LIST_SETS_RESUMPTION_TOKEN,
                            URLEncoder.encode(tokenString, "UTF-8"));
                }
                Document doc = dbf.newDocumentBuilder().parse(url);
                NodeList sets = (NodeList) XPATH_LIST_SETS.evaluate(doc, XPathConstants.NODESET);
                for (int i = 0; i < sets.getLength(); i++) {
                    Node set = sets.item(i);
                    String setName = (String) XPATH_SET_NAME.evaluate(set, XPathConstants.STRING);
                    String setGroup = (String) XPATH_SET_SPEC.evaluate(set, XPathConstants.STRING);

                    if (mapSets.put(setGroup, setName) != null) {
                        System.out.println(
                                "Warning, the group already exists in the set: " + setGroup + " | " + setName);
                    }
                }

                Node nodeToken = (Node) XPATH_SETS_RESUMPTION_TOKEN.evaluate(doc, XPathConstants.NODE);
                if (null != nodeToken && nodeToken instanceof Element) {
                    tokenString = ((Element) nodeToken).getTextContent();
                }
            } while (null != tokenString && !tokenString.isEmpty());

            return mapSets;
        } catch (Exception e) {
            e.printStackTrace();
        }

        return null;
    }

    public void downloadRecords(SetStatus set) throws HarvesterException, UnsupportedEncodingException, IOException,
            InterruptedException, XPathExpressionException, SAXException, ParserConfigurationException {

        // Generate the URL of request
        String url = null;
        ;
        if (set.hasToken()) {
            try {
                url = repoUrl + String.format(URL_LIST_RECORDS_RESUMPTION_TOKEN,
                        URLEncoder.encode(set.getToken(), "UTF-8"));
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            }
        }
        if (null == url) {
            if (!set.hasName())
                url = repoUrl + String.format(URL_LIST_DEFAULT_RECORDS, metadataPrefix);
            else
                url = repoUrl + String.format(URL_LIST_RECORDS, URLEncoder.encode(set.getName(), "UTF-8"),
                        metadataPrefix);
        }

        System.out.println("Downloading records: " + url);

        String xml = null;

        // Get XML document 
        URLConnection conn = new URL(url).openConnection();
        if (connectionTimeout > 0)
            conn.setConnectTimeout(connectionTimeout);
        if (readTimeout > 0)
            conn.setReadTimeout(readTimeout);
        try (InputStream is = conn.getInputStream()) {
            if (null != is)
                xml = IOUtils.toString(is, StandardCharsets.UTF_8.name());
        }

        // Check if xml has been returned and check what it had a valid root element
        if (null == xml)
            throw new HarvesterException("The XML document is empty");
        Document doc = dbf.newDocumentBuilder().parse(new InputSource(new StringReader(xml)));

        // Extract root node
        Node root = (Node) XPATH_OAI_PMH.evaluate(doc, XPathConstants.NODE);
        if (null == root)
            throw new HarvesterException("The document is not an OAI:PMH file");

        // Check for error node
        Node error = (Node) XPATH_ERROR.evaluate(root, XPathConstants.NODE);
        if (null != error && error instanceof Element) {
            String code = ((Element) error).getAttribute("code");
            String message = ((Element) error).getTextContent();

            if (ERR_NO_RECORDS_MATCH.equals(code)) {
                System.out.println("Error: The set is empty");

                set.setFiles(0);
                set.resetToken();

                return;
            } else
                throw new HarvesterException(code, message);
        }

        Node nodeToken = (Node) XPATH_RECORDS_RESUMPTION_TOKEN.evaluate(root, XPathConstants.NODE);

        if (null != nodeToken && nodeToken instanceof Element) {
            String tokenString = ((Element) nodeToken).getTextContent();
            if (null != tokenString && !tokenString.isEmpty())
                set.setToken(tokenString);
            else
                set.resetToken();

            set.setCursor(((Element) nodeToken).getAttribute("cursor"));
            set.setSize(((Element) nodeToken).getAttribute("completeListSize"));

            set.dumpToken(System.out);
        } else
            set.resetToken();

        String filePath = repoPrefix + "/" + metadataPrefix + "/" + harvestDate + "/" + set.getNameSafe() + "/"
                + set.getFiles() + ".xml";

        if (StringUtils.isNullOrEmpty(bucketName)) {

            FileUtils.writeStringToFile(new File(folderName, filePath), xml);

        } else {
            byte[] bytes = xml.getBytes(StandardCharsets.UTF_8);

            ObjectMetadata metadata = new ObjectMetadata();
            metadata.setContentEncoding(StandardCharsets.UTF_8.name());
            metadata.setContentType("text/xml");
            metadata.setContentLength(bytes.length);

            InputStream inputStream = new ByteArrayInputStream(bytes);

            PutObjectRequest request = new PutObjectRequest(bucketName, filePath, inputStream, metadata);

            s3client.putObject(request);
        }

        set.incFiles();
    }

    /**
     * Main function to organize the standard harvest process. The function will identify 
     * on the server and will download list of the sets. Then it will download each set, 
     * creating index, if needed. Every record of the set will be stored in the separate 
     * file in the set folder. If function will find, that some record has been changed, 
     * the old record will be stored in the set cache folder. The function will also create 
     * and update harvesting status in the xml file under repo base folder. This file will 
     * be deleted after harvesting process will be finished. If, by any reason, harvesting 
     * process will be terminated, the status file will be left on disk. In this case the 
     * function will try to load in and resume it work. If, by some reason this behaviour 
     * is not required, the status file must be deleted before calling this function.
     * @param prefix A metadata prefix
     * @throws Exception
     */

    /**
     * Alternative function to organize the harvest process. The difference with another function
     * is in data storage. The harvest2 function will store files in the raw format as they come
     * from the server.
     * The harvesting method should never be mixed. The harvesting folder must be wiped out if 
     * switching to this method, or function will fail.
     * @param prefix A metadata prefix
     * @throws Exception
     */
    public boolean harvest() throws Exception {
        if (StringUtils.isNullOrEmpty(metadataPrefix))
            throw new IllegalArgumentException("The OAI:PMH Metadata Prefix can not be empty");

        System.out.println("Downloading set list");

        boolean result = false;

        if (null == whiteList || whiteList.isEmpty()) {

            System.out.println(
                    "There is no whitelist found. Proceeding with downloading the list of all available sets.");

            // download all sets in the repository
            Map<String, String> mapSets = listSets();

            if (null == mapSets || mapSets.isEmpty()) {
                System.out.println("Processing default set");

                result = harvestSet(new SetStatus(null, "Default"));
            } else {

                result = false;

                for (Map.Entry<String, String> entry : mapSets.entrySet()) {

                    SetStatus set = new SetStatus(entry.getKey().trim(),
                            URLDecoder.decode(entry.getValue(), StandardCharsets.UTF_8.name()));

                    // if black list exists and item is blacklisted, continue
                    if (null != blackList && blackList.contains(set)) {
                        set.setFiles(-2);
                        saveSetStats(set); // set was ignored
                        continue;
                    }

                    System.out.println("Processing set: "
                            + URLDecoder.decode(entry.getValue(), StandardCharsets.UTF_8.name()));

                    if (!harvestSet(set)) {
                        System.err.println(
                                "The harvesting job has been aborted due to an error. If you want harvesting to be continued, please set option 'fail.on.error' to 'false' in the configuration file");
                        result = false;
                        break;
                    } else
                        result = true;
                }

            }
        } else {
            for (String item : whiteList) {
                if (!harvestSet(new SetStatus(item, item))) {
                    System.err.println(
                            "The harvesting job has been aborted due to an error. If you want harvesting to be continued, please set option 'fail.on.error' to 'false' in the configuration file");
                    result = false;
                    break;
                } else
                    result = true;
            }

        }
        if (result) {
            String filePath = repoPrefix + "/" + metadataPrefix + "/latest.txt";

            if (StringUtils.isNullOrEmpty(bucketName)) {

                FileUtils.writeStringToFile(new File(folderName, filePath), harvestDate);

            } else {

                byte[] bytes = harvestDate.getBytes(StandardCharsets.UTF_8);

                ObjectMetadata metadata = new ObjectMetadata();
                metadata.setContentEncoding(StandardCharsets.UTF_8.name());
                metadata.setContentType("text/plain");
                metadata.setContentLength(bytes.length);

                InputStream inputStream = new ByteArrayInputStream(bytes);

                PutObjectRequest request = new PutObjectRequest(bucketName, filePath, inputStream, metadata);

                s3client.putObject(request);
            }
        }

        return result;
    }

    private boolean harvestSet(SetStatus set) throws Exception {
        long mark = System.currentTimeMillis();

        do {
            for (int nAttempt = 0; nAttempt <= maxAttempts; ++nAttempt)
                try {
                    downloadRecords(set);

                    break;
                } catch (Exception e) {
                    // only for debug!
                    e.printStackTrace();

                    if (nAttempt == maxAttempts) {

                        System.err.println("Error: " + e.getMessage());

                        set.setError(e.getMessage());
                        set.resetToken();

                        break;
                    }

                    Thread.sleep(attemptDelay);
                }
        } while (set.hasToken());

        set.setMilliseconds(System.currentTimeMillis() - mark);
        saveSetStats(set);

        if (failOnError && set.hasError())
            return false;

        return true;
    }

    public void printStatistics(boolean result, PrintStream out) {
        out.println();
        if (result)
            out.println("The harvesting process has been finished successfully");
        else
            out.println("The harvesting process has been completed with some errors");
        int errorSets = 0;
        int harvestedSets = 0;
        int emptySets = 0;
        int ignoredSets = 0;
        int blacklistedSets = 0;

        for (SetStatus set : processedSets.values()) {
            if (set.getFiles() == 0)
                ++emptySets;
            else if (set.getFiles() > 0)
                ++harvestedSets;
            else if (set.getFiles() == -1)
                ++ignoredSets;
            else if (set.getFiles() == -2)
                ++blacklistedSets;
            else if (set.getFiles() == -3)
                ++errorSets;
        }

        if (harvestedSets > 0) {
            out.println();
            out.println(String.format("Successfully harvested %s %s:", harvestedSets,
                    harvestedSets == 1 ? "set" : "sets"));
            int counter = 1;
            for (SetStatus set : processedSets.values())
                if (set.getFiles() > 0)
                    out.println(String.format("Set %d. %s (%s): %d %s has been harvested in %s", counter++,
                            set.getTitle(), set.getName(), set.getFiles(), set.getFiles() == 1 ? "file" : "files",
                            set.getEllapsedTime()));
        }

        if (emptySets > 0) {
            out.println();
            out.println(String.format("%d %s has been empty:", emptySets, emptySets == 1 ? "set" : "sets"));
            int counter = 1;
            for (SetStatus set : processedSets.values())
                if (set.getFiles() == 0)
                    out.println(String.format("Set %d. %s (%s)", counter++, set.getTitle(), set.getName()));
        }

        if (ignoredSets > 0) {
            out.println();
            out.println(String.format("%d %s has been ignored by the WHITE list:", ignoredSets,
                    ignoredSets == 1 ? "set" : "sets"));
            int counter = 1;
            for (SetStatus set : processedSets.values())
                if (set.getFiles() == -1)
                    out.println(String.format("Set %d. %s (%s)", counter++, set.getTitle(), set.getName()));
        }

        if (blacklistedSets > 0) {
            out.println();
            out.println(String.format("%d %s has been ignored by the black list:", blacklistedSets,
                    blacklistedSets == 1 ? "set" : "sets"));
            int counter = 1;
            for (SetStatus set : processedSets.values())
                if (set.getFiles() == -2)
                    out.println(String.format("Set %d. %s (%s)", counter++, set.getTitle(), set.getName()));
        }

        if (errorSets > 0) {
            out.println();
            out.println(String.format("%d %s has got an error:", errorSets, errorSets == 1 ? "set" : "sets"));
            int counter = 1;
            for (SetStatus set : processedSets.values())
                if (set.getFiles() == -3)
                    out.println(String.format("Set %d. %s (%s) error: %s", counter++, set.getTitle(), set.getName(),
                            set.getError()));
        }
    }

    private void saveSetStats(SetStatus set) {
        processedSets.put(set.getNameSafe(), set);
    }

    /**
     * Function to get Repo URL
     * @return String - Repo URL
     */
    public String getRepoUrl() {
        return repoUrl;
    }

    /**
     * Function to set Repo URL
     * @param repoUrl an Repo URL
     */
    public void setRepoUrl(String repoUrl) {
        this.repoUrl = repoUrl;
    }
}