eu.clarin.cmdi.vlo.importer.MetadataImporter.java Source code

Introduction

Here is the source code for eu.clarin.cmdi.vlo.importer.MetadataImporter.java. This class is the entry point of the VLO metadata importer: it reads the configured data roots, parses every CMDI metadata file found there, and indexes the resulting documents into a Solr server.

Source

package eu.clarin.cmdi.vlo.importer;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.clarin.cmdi.vlo.CommonUtils;
import eu.clarin.cmdi.vlo.FacetConstants;
import eu.clarin.cmdi.vlo.LanguageCodeUtils;
import eu.clarin.cmdi.vlo.StringUtils;
import eu.clarin.cmdi.vlo.config.DataRoot;
import eu.clarin.cmdi.vlo.config.VloConfig;
import eu.clarin.cmdi.vlo.config.XmlVloConfigFactory;
import java.time.LocalDate;
import java.time.ZoneId;
import static java.time.temporal.ChronoUnit.DAYS;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;

/**
 * The main MetadataImporter class; it also contains the main function.
 *
 * The metadata importer reads all the configuration files and then, for each
 * metadata file in each configured directory structure, parses and imports it
 * as defined in the configuration. The startImport method drives the whole
 * import process.
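 *
 * <p>
 * A minimal programmatic usage sketch, mirroring what {@link #main} does (the
 * configuration file location is a placeholder):
 * <pre>{@code
 * URL configUrl = new File("VloConfig.xml").toURI().toURL();
 * MetadataImporter.config = new XmlVloConfigFactory(configUrl).newConfig();
 * MetadataImporter.languageCodeUtils = new LanguageCodeUtils(MetadataImporter.config);
 * new MetadataImporter().startImport();
 * }</pre>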
 */
public class MetadataImporter {

    private static final int SOLR_SERVER_THREAD_COUNT = 2;
    /**
     * Defines which files to try and parse. In this case all files ending in
     * "xml" or "cmdi".
     */
    private static final String[] VALID_CMDI_EXTENSIONS = new String[] { "xml", "cmdi" };

    /**
     * Logger for this class.
     */
    protected final static Logger LOG = LoggerFactory.getLogger(MetadataImporter.class);
    /**
     * Stores the last error reported by the Solr server (see initSolrServer).
     */
    private static Throwable serverError;
    /**
     * The Solr server interface.
     */
    private ConcurrentUpdateSolrServer solrServer;
    /**
     * Defines the post-processor associations. At import, for each facet
     * value, this map is checked and all post-processors associated with the
     * facet _type_ are applied to the value before storing the new value in
     * the Solr document.
     */
    final static Map<String, PostProcessor> POST_PROCESSORS = new HashMap<>();

    static {
        POST_PROCESSORS.put(FacetConstants.FIELD_ID, new IdPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_CONTINENT, new ContinentNamePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_COUNTRY, new CountryNamePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE_CODE, new LanguageCodePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LANGUAGE_NAME, new LanguageNamePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_AVAILABILITY, new AvailabilityPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_ORGANISATION, new OrganisationPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_TEMPORAL_COVERAGE, new TemporalCoveragePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_NATIONAL_PROJECT, new NationalProjectPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_CLARIN_PROFILE, new CMDIComponentProfileNamePostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_RESOURCE_CLASS, new ResourceClassPostProcessor());
        POST_PROCESSORS.put(FacetConstants.FIELD_LICENSE, new LicensePostProcessor());
    }
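
    /*
     * Illustration only (the actual lookup happens in the CMDI processing
     * code, not in this class): the post-processor registered for a facet is
     * applied to a raw value before it is stored, e.g.:
     *
     *   PostProcessor pp = POST_PROCESSORS.get(FacetConstants.FIELD_COUNTRY);
     *   List<String> normalized = pp.process("NL"); // "NL" is a made-up raw value
     */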

    /**
     * Constructor
     */
    public MetadataImporter() {
    }

    public MetadataImporter(String clDatarootsList) {
        this.clDatarootsList = clDatarootsList;
    }

    /**
     * Ids of records (usually their MdSelfLinks) that have already been
     * processed, used to detect and skip duplicates.
     */
    protected final Set<String> processedIds = new HashSet<>();
    /**
     * Buffer for Solr documents (sending a whole batch to the Solr server at
     * once is more efficient than sending documents one by one).
     */
    protected List<SolrInputDocument> docs = new ArrayList<>();

    // SOME STATS
    protected int nrOfDocumentsSent;
    protected int nrOfFilesAnalyzed = 0;
    protected int nrOfFilesWithoutId = 0;
    protected int nrOfFilesWithError = 0;
    protected int nrOfFilesTooLarge = 0;

    /**
     * Retrieves all files with VALID_CMDI_EXTENSIONS from all DataRoot entries
     * and starts processing for every single file
     *
     * @throws MalformedURLException
     */
    void startImport() throws MalformedURLException {

        initSolrServer();
        List<DataRoot> dataRoots = checkDataRoots();
        dataRoots = filterDataRootsWithCLArgs(dataRoots);

        long start = System.currentTimeMillis();
        try {
            // Delete the whole Solr db
            if (config.getDeleteAllFirst()) {
                LOG.info("Deleting original data...");
                solrServer.deleteByQuery("*:*");
                solrServer.commit();
                LOG.info("Deleting original data done.");
            }

            // Import the specified data roots
            for (DataRoot dataRoot : dataRoots) {
                LOG.info("Start of processing: " + dataRoot.getOriginName());
                if (dataRoot.deleteFirst()) {
                    LOG.info("Deleting data for data provider: " + dataRoot.getOriginName());
                    solrServer.deleteByQuery(FacetConstants.FIELD_DATA_PROVIDER + ":"
                            + ClientUtils.escapeQueryChars(dataRoot.getOriginName()));
                    LOG.info("Deleting data of provider done.");
                }
                CMDIDataProcessor processor = new CMDIParserVTDXML(POST_PROCESSORS, config, false);
                List<List<File>> centreFilesList = getFilesFromDataRoot(dataRoot.getRootFile());
                // import files from every endpoint
                for (List<File> centreFiles : centreFilesList) {
                    LOG.info("Processing directory: {}", centreFiles.get(0).getParent());

                    // identify mdSelfLinks and remove files that are too large from the centre file list
                    LOG.info("Extracting mdSelfLinks");
                    Set<String> mdSelfLinkSet = new HashSet<>();
                    Set<File> ignoredFileSet = new HashSet<>();
                    for (File file : centreFiles) {
                        if (config.getMaxFileSize() > 0 && file.length() > config.getMaxFileSize()) {
                            LOG.info("Skipping " + file.getAbsolutePath() + " because it is too large.");
                            nrOfFilesTooLarge++;
                            ignoredFileSet.add(file);
                        } else {
                            String mdSelfLink = null;
                            try {
                                mdSelfLink = processor.extractMdSelfLink(file);
                            } catch (Exception e) {
                                LOG.error("error in file: {}", file, e);
                                nrOfFilesWithError++;
                            }
                            if (mdSelfLink != null) {
                                mdSelfLinkSet.add(StringUtils.normalizeIdString(mdSelfLink));
                            }
                        }
                    }
                    centreFiles.removeAll(ignoredFileSet);

                    // inform structure graph about MdSelfLinks of all files in this collection
                    ResourceStructureGraph.setOccurringMdSelfLinks(mdSelfLinkSet);
                    LOG.info("...extracted {} mdSelfLinks", mdSelfLinkSet.size());

                    // process every file in this collection
                    for (File file : centreFiles) {
                        LOG.debug("PROCESSING FILE: {}", file.getAbsolutePath());
                        processCmdi(file, dataRoot, processor);
                    }
                    if (!docs.isEmpty()) {
                        sendDocs();
                    }
                    solrServer.commit();
                    if (config.isProcessHierarchies()) {
                        updateDocumentHierarchy();
                    }
                }
                updateDaysSinceLastImport(dataRoot);
                LOG.info("End of processing: " + dataRoot.getOriginName());
            }

            // delete outdated entries (based on maxDaysInSolr parameter)
            if (config.getMaxDaysInSolr() > 0 && !config.getDeleteAllFirst()) {
                LOG.info("Deleting old files that were not seen for more than " + config.getMaxDaysInSolr()
                        + " days...");
                solrServer.deleteByQuery(
                        FacetConstants.FIELD_LAST_SEEN + ":[* TO NOW-" + config.getMaxDaysInSolr() + "DAYS]");
                LOG.info("Deleting old files done.");
            }
        } catch (SolrServerException e) {
            LOG.error("error updating files:\n", e);
            LOG.error("Also see vlo_solr server logs for more information");
        } catch (IOException e) {
            LOG.error("error updating files:\n", e);
        } finally {
            try {
                if (solrServer != null) {
                    solrServer.commit();
                    buildSuggesterIndex();
                }
            } catch (SolrServerException | IOException e) {
                LOG.error("cannot commit:\n", e);
            }
        }
        long took = (System.currentTimeMillis() - start) / 1000;
        LOG.info("Found " + nrOfFilesWithoutId
                + " file(s) without an id. (id is generated based on fileName but that may not be unique)");
        LOG.info("Found " + nrOfFilesWithError + " file(s) with errors.");
        LOG.info("Found " + nrOfFilesTooLarge + " file(s) too large.");
        LOG.info("Update of " + nrOFDocumentsSend + " took " + took + " secs. Total nr of files analyzed "
                + nrOfFilesAnalyzed);
        solrServer.shutdown();
    }

    /**
     * Checks a list of DataRoots for the existence of their root file
     * (typically the parent directory of the metadata files)
     *
     * @return the list of DataRoots whose root file exists
     */
    protected List<DataRoot> checkDataRoots() {
        List<DataRoot> dataRoots = config.getDataRoots();
        List<DataRoot> existingDataRoots = new LinkedList<>();
        for (DataRoot dataRoot : dataRoots) {
            if (!dataRoot.getRootFile().exists()) {
                LOG.warn("Root file " + dataRoot.getRootFile()
                        + " does not exist. It could be configuration error! Proceeding with next ...");
            } else {
                existingDataRoots.add(dataRoot);
            }

        }
        return existingDataRoots;
    }

    /**
     * If the user specified which data roots should be imported, the list of
     * existing data roots will be filtered against the list from the user
     *
     * @param dataRoots complete list of DataRoots
     * @return list of DataRoots without DataRoots excluded by the user
     */
    protected List<DataRoot> filterDataRootsWithCLArgs(List<DataRoot> dataRoots) {
        if (clDatarootsList == null) {
            return dataRoots;
        }

        LOG.info("Filtering configured data root files with command line arguments: \"" + clDatarootsList + "\"");

        LinkedList<File> fsDataRoots = new LinkedList<>();

        List<String> paths = Arrays.asList((clDatarootsList.split("\\s+")));

        //Convert String paths to File objects for comparison
        for (String path : paths) {
            fsDataRoots.add(new File(path));
        }

        List<DataRoot> filteredDataRoots = new LinkedList<>();
        try {
            //filter data
            dr: for (DataRoot dataRoot : dataRoots) {
                for (File fsDataRoot : fsDataRoots) {
                    if (fsDataRoot.getCanonicalPath().equals(dataRoot.getRootFile().getCanonicalPath())) {
                        filteredDataRoots.add(dataRoot);
                        fsDataRoots.remove(fsDataRoot);
                        continue dr;
                    }
                }
                LOG.info("Root file " + dataRoot.getRootFile() + " will be omitted from processing");
            }
        } catch (IOException e) {
            filteredDataRoots = dataRoots;
        }

        return filteredDataRoots;
    }

    /**
     * Gets all files with VALID_CMDI_EXTENSIONS, grouped per centre directory,
     * if rootFile is a directory, or rootFile itself if it is a file
     *
     * @param rootFile
     * @return list of centre lists of all contained CMDI files if rootFile is
     * a directory, or a singleton list containing rootFile if it is a file
     */
    protected List<List<File>> getFilesFromDataRoot(File rootFile) {
        List<List<File>> result = new ArrayList<>();
        if (rootFile.isFile()) {
            List<File> singleFileList = new ArrayList<>();
            singleFileList.add(rootFile);
            result.add(singleFileList);
        } else {
            File[] centerDirs = rootFile.listFiles();
            for (File centerDir : centerDirs) {
                List<File> centerFileList = new ArrayList<>();
                if (centerDir.isDirectory()) {
                    centerFileList.addAll(FileUtils.listFiles(centerDir, VALID_CMDI_EXTENSIONS, true));
                }

                if (!centerFileList.isEmpty()) {
                    result.add(centerFileList);
                }
            }
        }
        return result;
    }

    /**
     * Create an interface to the SOLR server.
     *
     * After the interface has been created the importer can send documents to
     * the server. Sending documents involves a queue: the importer adds
     * documents to the queue, and dedicated threads empty it, effectively
     * storing the documents.
     *
     * @throws MalformedURLException
     */
    protected void initSolrServer() throws MalformedURLException {
        String solrUrl = config.getSolrUrl();
        LOG.info("Initializing concurrent Solr Server on {} with {} threads", solrUrl, SOLR_SERVER_THREAD_COUNT);

        /* The queue size specifies the number of buffered documents that will
         * trigger the two dedicated threads to start emptying the queue.
         */
        solrServer = new ConcurrentUpdateSolrServer(solrUrl, config.getMinDocsInSolrQueue(),
                SOLR_SERVER_THREAD_COUNT) {
            /*
             * Let the superclass method handle the exception, then make it
             * available to the importer via the serverError variable.
             */
            @Override
            public void handleError(Throwable exception) {
                super.handleError(exception);
                serverError = exception;
            }
        };
    }

    /**
     * Process single CMDI file with CMDIDataProcessor
     *
     * @param file CMDI input file
     * @param dataOrigin
     * @param processor
     * @throws SolrServerException
     * @throws IOException
     */
    protected void processCmdi(File file, DataRoot dataOrigin, CMDIDataProcessor processor)
            throws SolrServerException, IOException {
        nrOfFilesAnalyzed++;
        CMDIData cmdiData = null;
        try {
            cmdiData = processor.process(file);
            if (!idOk(cmdiData.getId())) {
                cmdiData.setId(dataOrigin.getOriginName() + "/" + file.getName()); //No id found in the metadata file so making one up based on the file name. Not guaranteed to be unique, but we have to set something.
                nrOfFilesWithoutId++;
            }
        } catch (Exception e) {
            LOG.error("error in file: {}", file, e);
            nrOfFilesWithError++;
        }
        if (cmdiData != null) {
            if (processedIds.add(cmdiData.getId())) {
                SolrInputDocument solrDocument = cmdiData.getSolrDocument();
                if (solrDocument != null) {
                    updateDocument(solrDocument, cmdiData, file, dataOrigin);
                    if (config.isProcessHierarchies()
                            && ResourceStructureGraph.getVertex(cmdiData.getId()) != null) {
                        ResourceStructureGraph.getVertex(cmdiData.getId()).setWasImported(true);
                    }
                }
            } else {
                LOG.warn("Skipping {}, already processed id: {}", file, cmdiData.getId());
            }
        }
    }

    /**
     * Checks an id for validity
     *
     * @param id
     * @return true if the id is acceptable, false otherwise
     */
    protected boolean idOk(String id) {
        return id != null && !id.isEmpty();
    }

    /**
     * Adds some additional information from the DataRoot to the solrDocument,
     * adds the solrDocument to the document list, and submits the list to the
     * Solr server whenever it reaches the configured maximum size
     *
     * @param solrDocument
     * @param cmdiData
     * @param file
     * @param dataOrigin
     * @throws SolrServerException
     * @throws IOException
     */
    protected void updateDocument(SolrInputDocument solrDocument, CMDIData cmdiData, File file, DataRoot dataOrigin)
            throws SolrServerException, IOException {
        if (!solrDocument.containsKey(FacetConstants.FIELD_COLLECTION)) {
            solrDocument.addField(FacetConstants.FIELD_COLLECTION, dataOrigin.getOriginName());
        }
        solrDocument.addField(FacetConstants.FIELD_DATA_PROVIDER, dataOrigin.getOriginName());
        solrDocument.addField(FacetConstants.FIELD_ID, cmdiData.getId());
        solrDocument.addField(FacetConstants.FIELD_FILENAME, file.getAbsolutePath());

        String metadataSourceUrl = dataOrigin.getPrefix();
        metadataSourceUrl += file.getAbsolutePath().substring(dataOrigin.getToStrip().length());

        solrDocument.addField(FacetConstants.FIELD_COMPLETE_METADATA, metadataSourceUrl);

        // add SearchServices (should be CQL endpoint)
        for (Resource resource : cmdiData.getSearchResources()) {
            solrDocument.addField(FacetConstants.FIELD_SEARCH_SERVICE, resource.getResourceName());
        }

        // add landing page resource
        for (Resource resource : cmdiData.getLandingPageResources()) {
            solrDocument.addField(FacetConstants.FIELD_LANDINGPAGE, resource.getResourceName());
        }

        // add search page resource
        for (Resource resource : cmdiData.getSearchPageResources()) {
            solrDocument.addField(FacetConstants.FIELD_SEARCHPAGE, resource.getResourceName());
        }

        // add timestamp
        Date dt = new Date();
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
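        // note: 'Z' is a literal pattern character here, so the formatted value
        // only denotes real UTC if the JVM's default time zone is UTC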
        solrDocument.addField(FacetConstants.FIELD_LAST_SEEN, df.format(dt));

        // set number of days since last import to '0'
        solrDocument.addField(FacetConstants.FIELD_DAYS_SINCE_LAST_SEEN, 0);

        // add resource proxies
        addResourceData(solrDocument, cmdiData);

        LOG.debug("Adding document for submission to SOLR: {}", file);
        docs.add(solrDocument);
        if (docs.size() == config.getMaxDocsInList()) {
            sendDocs();
        }
    }

    /**
     * Adds two fields, FIELD_FORMAT and FIELD_RESOURCE. The type can be
     * specified in the "ResourceType" element of an IMDI file or possibly
     * overwritten by some more specific XPath (as in the LRT CMDI files). So if
     * a type has been overwritten and is already in the solrDocument, we take
     * that type.
     *
     * @param solrDocument
     * @param cmdiData
     */
    protected void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
        List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_FORMAT)
                ? new ArrayList<>(solrDocument.getFieldValues(FacetConstants.FIELD_FORMAT))
                : null;
        solrDocument.removeField(FacetConstants.FIELD_FORMAT); //Remove old values; they might be overwritten.
        List<Resource> resources = cmdiData.getDataResources();
        for (int i = 0; i < resources.size(); i++) {
            Resource resource = resources.get(i);
            String mimeType = resource.getMimeType();
            if (mimeType == null) {
                if (fieldValues != null && i < fieldValues.size()) {
                    mimeType = CommonUtils.normalizeMimeType(fieldValues.get(i).toString());
                } else {
                    mimeType = CommonUtils.normalizeMimeType("");
                }
            }

            FormatPostProcessor processor = new FormatPostProcessor();
            mimeType = processor.process(mimeType).get(0);

            // TODO check should probably be moved into Solr (by using some minimum length filter)
            if (!mimeType.equals("")) {
                solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType);
            }
            solrDocument.addField(FacetConstants.FIELD_RESOURCE,
                    mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR + resource.getResourceName());
        }
        solrDocument.addField(FacetConstants.FIELD_RESOURCE_COUNT, resources.size());
    }

    /**
     * Sends the current list of SolrInputDocuments to the Solr server and
     * clears the list afterwards
     *
     * @throws SolrServerException
     * @throws IOException
     */
    protected void sendDocs() throws SolrServerException, IOException {
        LOG.info("Sending " + docs.size() + " docs to solr server. Total number of docs updated till now: "
                + nrOFDocumentsSend);
        nrOFDocumentsSend += docs.size();
        solrServer.add(docs);
        if (serverError != null) {
            throw new SolrServerException(serverError);
        }
        docs = new ArrayList<>();
    }

    /**
     * Builds suggester index for autocompletion
     *
     * @throws SolrServerException
     * @throws MalformedURLException
     */
    private void buildSuggesterIndex() throws SolrServerException, MalformedURLException {
        LOG.info("Building index for autocompletion.");
        HashMap<String, String> paramMap = new HashMap<>();
        paramMap.put("qt", "/suggest");
        paramMap.put("spellcheck.build", "true");
        SolrParams params = new MapSolrParams(paramMap);
        solrServer.query(params);
    }

    /**
     * Updates documents in Solr with their hierarchy weight and lists of
     * related resources (hasPart & isPartOf)
     *
     * @throws SolrServerException
     * @throws MalformedURLException
     */
    private void updateDocumentHierarchy() throws SolrServerException, MalformedURLException, IOException {
        LOG.info(ResourceStructureGraph.printStatistics(0));
        boolean updatedDocs = false;
        List<SolrInputDocument> updateDocs = new ArrayList<>();
        Iterator<CmdiVertex> vertexIter = ResourceStructureGraph.getFoundVertices().iterator();
        while (vertexIter.hasNext()) {
            CmdiVertex vertex = vertexIter.next();
            List<String> incomingVertexNames = ResourceStructureGraph.getIncomingVertexNames(vertex);
            List<String> outgoingVertexNames = ResourceStructureGraph.getOutgoingVertexNames(vertex);

            // update vertex if changes are necessary (necessary if non-default weight or edges to other resources)
            if (vertex.getHierarchyWeight() != 0 || !incomingVertexNames.isEmpty()
                    || !outgoingVertexNames.isEmpty()) {
                updatedDocs = true;
                SolrInputDocument doc = new SolrInputDocument();
                doc.setField(FacetConstants.FIELD_ID, Arrays.asList(vertex.getId()));
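                // fields set to a map with a "set" key below use Solr's atomic-update
                // syntax: only that field of the stored document is replaced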

                if (vertex.getHierarchyWeight() != 0) {
                    Map<String, Integer> partialUpdateMap = new HashMap<>();
                    partialUpdateMap.put("set", Math.abs(vertex.getHierarchyWeight()));
                    doc.setField(FacetConstants.FIELD_HIERARCHY_WEIGHT, partialUpdateMap);
                }

                // remove vertices that were not imported
                Iterator<String> incomingVertexIter = incomingVertexNames.iterator();
                while (incomingVertexIter.hasNext()) {
                    String vertexId = incomingVertexIter.next();
                    if (ResourceStructureGraph.getVertex(vertexId) == null
                            || !ResourceStructureGraph.getVertex(vertexId).getWasImported()) {
                        incomingVertexIter.remove();
                    }
                }
                Iterator<String> outgoingVertexIter = outgoingVertexNames.iterator();
                while (outgoingVertexIter.hasNext()) {
                    String vertexId = outgoingVertexIter.next();
                    if (ResourceStructureGraph.getVertex(vertexId) == null
                            || !ResourceStructureGraph.getVertex(vertexId).getWasImported()) {
                        outgoingVertexIter.remove();
                    }
                }

                if (!incomingVertexNames.isEmpty()) {
                    Map<String, List<String>> partialUpdateMap = new HashMap<>();
                    partialUpdateMap.put("set", incomingVertexNames);
                    doc.setField(FacetConstants.FIELD_HAS_PART, partialUpdateMap);

                    Map<String, Integer> partialUpdateMapCount = new HashMap<>();
                    partialUpdateMapCount.put("set", incomingVertexNames.size());
                    doc.setField(FacetConstants.FIELD_HAS_PART_COUNT, partialUpdateMapCount);

                    // add hasPartCount weight
                    Double hasPartCountWeight = Math.log10(1 + Math.min(50, incomingVertexNames.size()));
                    Map<String, Double> partialUpdateMapCountWeight = new HashMap<>();
                    partialUpdateMapCountWeight.put("set", hasPartCountWeight);
                    doc.setField(FacetConstants.FIELD_HAS_PART_COUNT_WEIGHT, partialUpdateMapCountWeight);
                }

                if (!outgoingVertexNames.isEmpty()) {
                    Map<String, List<String>> partialUpdateMap = new HashMap<>();
                    partialUpdateMap.put("set", outgoingVertexNames);
                    doc.setField(FacetConstants.FIELD_IS_PART_OF, partialUpdateMap);
                }
                updateDocs.add(doc);
            }

            if (updateDocs.size() == config.getMaxDocsInList()) {
                solrServer.add(updateDocs);
                if (serverError != null) {
                    throw new SolrServerException(serverError);
                }
                updateDocs = new ArrayList<>();
            }
        }
        if (!updateDocs.isEmpty()) {
            solrServer.add(updateDocs);
            if (serverError != null) {
                throw new SolrServerException(serverError);
            }
        }

        if (updatedDocs) {
            solrServer.commit();
        }

        ResourceStructureGraph.clearResourceGraph();
    }

    /**
     * Update "days since last import" field for all Solr records of dataRoot.
     * Notice that it will not touch records that have a "last seen" value newer
     * than today. Therefore this should be called <em>after</em> normal 
     * processing of data root!
     *
     * @param dataRoot
     * @throws SolrServerException
     * @throws IOException
     */
    private void updateDaysSinceLastImport(DataRoot dataRoot) throws SolrServerException, IOException {
        LOG.info("Updating \"days since last import\" in Solr for: {}", dataRoot.getOriginName());

        SolrQuery query = new SolrQuery();
        query.setQuery(
                //we're going to process all records in the current data root...
                FacetConstants.FIELD_DATA_PROVIDER + ":" + ClientUtils.escapeQueryChars(dataRoot.getOriginName())
                        + " AND "
                        // ...that have a "last seen" value _older_ than today (on update/initialisation all records get 0 so we can skip the rest)
                        + FacetConstants.FIELD_LAST_SEEN + ":[* TO NOW-1DAY]");
        query.setFields(FacetConstants.FIELD_ID, FacetConstants.FIELD_LAST_SEEN);
        int fetchSize = 1000;
        query.setRows(fetchSize);
        QueryResponse rsp = solrServer.query(query);

        final long totalResults = rsp.getResults().getNumFound();
        final LocalDate nowDate = LocalDate.now();

        final int docsListSize = config.getMaxDocsInList();
        List<SolrInputDocument> updateDocs = new ArrayList<>(docsListSize);

        boolean updatedDocs = false;
        int offset = 0;

        while (offset < totalResults) {
            query.setStart(offset);
            query.setRows(fetchSize);

            for (SolrDocument doc : solrServer.query(query).getResults()) {
                updatedDocs = true;

                String recordId = (String) doc.getFieldValue(FacetConstants.FIELD_ID);
                Date lastImportDate = (Date) doc.getFieldValue(FacetConstants.FIELD_LAST_SEEN);
                LocalDate oldDate = lastImportDate.toInstant().atZone(ZoneId.systemDefault()).toLocalDate();
                long daysSinceLastSeen = DAYS.between(oldDate, nowDate);

                SolrInputDocument updateDoc = new SolrInputDocument();
                updateDoc.setField(FacetConstants.FIELD_ID, recordId);

                Map<String, Long> partialUpdateMap = new HashMap<>();
                partialUpdateMap.put("set", daysSinceLastSeen);
                updateDoc.setField(FacetConstants.FIELD_DAYS_SINCE_LAST_SEEN, partialUpdateMap);

                updateDocs.add(updateDoc);

                if (updateDocs.size() == docsListSize) {
                    solrServer.add(updateDocs);
                    if (serverError != null) {
                        throw new SolrServerException(serverError);
                    }
                    updateDocs = new ArrayList<>(docsListSize);
                }
            }
            offset += fetchSize;
            LOG.info("Updating \"days since last import\": {} out of {} records updated", offset, totalResults);
        }

        if (!updateDocs.isEmpty()) {
            solrServer.add(updateDocs);
            if (serverError != null) {
                throw new SolrServerException(serverError);
            }
        }

        if (updatedDocs) {
            solrServer.commit();
        }

        LOG.info("Updating \"days since last import\" done.");
    }

    public static VloConfig config;

    public static LanguageCodeUtils languageCodeUtils;

    //data roots passed from command line    
    private String clDatarootsList = null;

    /**
     * @param args
     * @throws MalformedURLException
     * @throws IOException
     */
    public static void main(String[] args) throws MalformedURLException, IOException {

        // path to the configuration file
        String configFile = null;

        // use the Apache cli framework for getting command line parameters
        Options options = new Options();

        // Data root list passed from command line with -l option
        String cldrList = null;

        /**
         * Add the "c" option, indicating the specification of an XML
         * configuration file.
         *
         * The "l" option specifies which data roots (from the config file) to
         * import; all data roots are imported by default.
         */
        options.addOption("c", true, "-c <file> : use parameters specified in <file>");
        options.addOption("l", true,
                "-l <dataroot> [ ' ' <dataroot> ]* :  space separated list of dataroots to be processed.\n"
                        + "If dataroot is not specified in config file it will be ignored.");
        options.getOption("l").setOptionalArg(true);

        CommandLineParser parser = new PosixParser();

        try {
            // parse the command line arguments
            CommandLine cmd = parser.parse(options, args);
            if (cmd.hasOption("c")) {

                // the "c" option was specified, now get its value
                configFile = cmd.getOptionValue("c");
            }

            if (cmd.hasOption("l")) {
                cldrList = cmd.getOptionValue("l");
            }

        } catch (org.apache.commons.cli.ParseException ex) {

            /**
             * Caught an exception caused by command line parsing. The name of
             * the configuration file may still be obtained from the system
             * properties below.
             */
            String message = "Command line parsing failed. " + ex.getMessage();
            LOG.error(message);
            System.err.println(message);
        }

        if (configFile == null) {

            String message;

            message = "Could not get config file name via the command line, trying the system properties.";
            LOG.info(message);

            String key;

            key = "configFile";
            configFile = System.getProperty(key);
        }

        if (configFile == null) {

            String message;

            message = "Could not get filename as system property either - stopping.";
            LOG.error(message);
        } else {
            // read the configuration from the externally supplied file
            final URL configUrl;
            if (configFile.startsWith("file:")) {
                configUrl = new URL(configFile);
            } else {
                configUrl = new File(configFile).toURI().toURL();
            }
            System.out.println("Reading configuration from " + configUrl.toString());
            LOG.info("Reading configuration from " + configUrl.toString());
            final XmlVloConfigFactory configFactory = new XmlVloConfigFactory(configUrl);
            MetadataImporter.config = configFactory.newConfig();
            MetadataImporter.languageCodeUtils = new LanguageCodeUtils(MetadataImporter.config);

            // optionally, modify the configuration here
            // create and start the importer
            MetadataImporter importer = new MetadataImporter(cldrList);
            importer.startImport();

            // finished importing
            if (MetadataImporter.config.printMapping()) {
                File file = new File("xsdMapping.txt");
                FacetMappingFactory.printMapping(file);
                LOG.info("Printed facetMapping in " + file);
            }
        }
    }
}