com.marklogic.contentpump.DatabaseContentWriter.java Source code

Java tutorial

Introduction

Here is the source code for com.marklogic.contentpump.DatabaseContentWriter.java

Source

/*
 * Copyright 2003-2016 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.contentpump;

import java.io.IOException;
import java.util.HashSet;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import com.marklogic.mapreduce.ContentOutputFormat;
import com.marklogic.mapreduce.MarkLogicCounter;
import com.marklogic.mapreduce.ContentType;
import com.marklogic.mapreduce.ContentWriter;
import com.marklogic.mapreduce.DocumentURI;
import com.marklogic.mapreduce.MarkLogicConstants;
import com.marklogic.mapreduce.MarkLogicDocument;
import com.marklogic.mapreduce.utilities.AssignmentManager;
import com.marklogic.mapreduce.utilities.InternalUtilities;
import com.marklogic.xcc.AdhocQuery;
import com.marklogic.xcc.Content;
import com.marklogic.xcc.ContentCreateOptions;
import com.marklogic.xcc.ContentFactory;
import com.marklogic.xcc.ContentPermission;
import com.marklogic.xcc.ContentSource;
import com.marklogic.xcc.Session;
import com.marklogic.xcc.Session.TransactionMode;
import com.marklogic.xcc.exceptions.RequestException;

/**
 * MarkLogicRecordWriter that can 
 * 1) insert content from Archive to MarkLogic Server
 * 2) copy content from source MarkLogic Server to destination MarkLogic Server
 * 
 * @author ali
 * 
 */
public class DatabaseContentWriter<VALUE> extends ContentWriter<VALUE> implements MarkLogicConstants {
    public static final Log LOG = LogFactory.getLog(DatabaseContentWriter.class);

    private URIMetadata[][] metadatas;

    protected boolean isCopyProps;
    protected boolean isCopyColls;
    protected boolean isCopyPerms;
    protected boolean isCopyQuality;
    public static final String XQUERY_VERSION_1_0_ML = "xquery version \"1.0-ml\";\n";

    public DatabaseContentWriter(Configuration conf, Map<String, ContentSource> forestSourceMap, boolean fastLoad) {
        this(conf, forestSourceMap, fastLoad, null);
    }

    public DatabaseContentWriter(Configuration conf, Map<String, ContentSource> forestSourceMap, boolean fastLoad,
            AssignmentManager am) {
        super(conf, forestSourceMap, fastLoad, am);

        if (countBased) {
            metadatas = new URIMetadata[1][batchSize];
        } else {
            metadatas = new URIMetadata[forestIds.length][batchSize];
        }
        isCopyProps = conf.getBoolean(ConfigConstants.CONF_COPY_PROPERTIES, true);
        isCopyPerms = conf.getBoolean(ConfigConstants.CONF_COPY_PERMISSIONS, true);
        isCopyColls = conf.getBoolean(ConfigConstants.CONF_COPY_COLLECTIONS, true);
        isCopyQuality = conf.getBoolean(ConfigConstants.CONF_COPY_QUALITY, true);
    }

    /**
     * fetch the options information from conf and metadata, set to the field
     * "options"
     */
    protected ContentCreateOptions newContentCreateOptions(DocumentMetadata meta) {
        ContentCreateOptions opt = (ContentCreateOptions) options.clone();
        if (meta != null) {
            if (opt.getQuality() == 0) {
                opt.setQuality(meta.quality);
            }
            HashSet<String> colSet = new HashSet<String>(meta.collectionsList);
            if (opt.getCollections() != null) {
                // union copy_collection and output_collection
                for (String s : opt.getCollections()) {
                    colSet.add(s);
                }
            }
            opt.setCollections(colSet.toArray(new String[colSet.size()]));
            HashSet<ContentPermission> pSet = new HashSet<ContentPermission>(meta.permissionsList);
            if (opt.getPermissions() != null) {
                // union of output_permission & copy_permission
                for (ContentPermission p : opt.getPermissions()) {
                    pSet.add(p);
                }
            }
            opt.setPermissions(pSet.toArray(new ContentPermission[pSet.size()]));
        }
        return opt;
    }

    @Override
    public void write(DocumentURI key, VALUE value) throws IOException, InterruptedException {
        String uri = InternalUtilities.getUriWithOutputDir(key, outputDir);
        String csKey;
        int fId = 0;
        if (fastLoad) {
            if (!countBased) {
                // placement for legacy or bucket
                fId = am.getPlacementForestIndex(key);
                sfId = fId;
            } else {
                if (sfId == -1) {
                    sfId = am.getPlacementForestIndex(key);
                }
                fId = sfId;
            }
            csKey = forestIds[fId];
        } else {
            csKey = forestIds[hostId];
        }
        int sid = fId;

        Content content = null;
        DocumentMetadata meta = null;
        if (value instanceof DatabaseDocumentWithMeta) {
            meta = ((DatabaseDocumentWithMeta) value).getMeta();
            ContentCreateOptions opt = newContentCreateOptions(meta);
            MarkLogicDocument doc = (MarkLogicDocument) value;
            if (!meta.isNakedProps()) {
                opt.setFormat(doc.getContentType().getDocumentFormat());
                if (doc.getContentType() == ContentType.BINARY) {
                    content = ContentFactory.newContent(uri, doc.getContentAsByteArray(), opt);
                } else {
                    content = ContentFactory.newContent(uri, doc.getContentAsText().toString(), opt);
                }
            }
        } else {
            throw new UnsupportedOperationException(value.getClass() + " is not supported.");
        }
        if (countBased) {
            fId = 0;
        }
        pendingUris[sid].put(content, new DocumentURI(key));
        boolean inserted = false;
        if (batchSize > 1) {
            if (!meta.isNakedProps()) {
                // add new content
                forestContents[fId][counts[fId]] = content;
                metadatas[fId][counts[fId]++] = new URIMetadata(uri, meta);
            } else {
                // naked properties
                if (isCopyProps) {
                    if (sessions[sid] == null) {
                        sessions[sid] = getSession(csKey);
                    }
                    setDocumentProperties(uri, meta.getProperties(), isCopyPerms ? meta.getPermString() : null,
                            isCopyColls ? meta.getCollectionString() : null,
                            isCopyQuality ? meta.getQualityString() : null, sessions[sid]);
                    stmtCounts[sid]++;
                }
            }
            if (counts[fId] == batchSize) {
                if (sessions[sid] == null) {
                    sessions[sid] = getSession(csKey);
                }
                insertBatch(forestContents[fId], sid);
                stmtCounts[sid]++;
                if (isCopyProps) {
                    // insert properties
                    for (int i = 0; i < counts[fId]; i++) {
                        DocumentMetadata m = metadatas[fId][i].getMeta();
                        String u = metadatas[fId][i].getUri();
                        if (m != null && m.getProperties() != null) {
                            setDocumentProperties(u, m.getProperties(), null, null, null, sessions[sid]);
                            stmtCounts[sid]++;
                        }
                    }
                }
                //reset forest index for statistical
                if (countBased) {
                    sfId = -1;
                }
                counts[fId] = 0;
                inserted = true;
            }
        } else { // batchSize <= 1
            if (sessions[sid] == null) {
                sessions[sid] = getSession(csKey);
            }
            if (content != null) {
                insertContent(content, sid);
                stmtCounts[sid]++;
            }
            //reset forest index for statistical
            if (countBased) {
                sfId = -1;
            }
            if (isCopyProps && meta.getProperties() != null) {
                boolean naked = content == null;
                setDocumentProperties(uri, meta.getProperties(), isCopyPerms && naked ? meta.getPermString() : null,
                        isCopyColls && naked ? meta.getCollectionString() : null,
                        isCopyQuality && naked ? meta.getQualityString() : null, sessions[sid]);
                stmtCounts[sid]++;
            }
            inserted = true;
        }
        boolean committed = false;
        if (stmtCounts[sid] == txnSize && needCommit) {
            commit(sid);
            stmtCounts[sid] = 0;
            commitUris[sid].clear();
            committed = true;
        }
        if ((!fastLoad) && ((inserted && (!needCommit)) || committed)) {
            // rotate to next host and reset session
            hostId = (hostId + 1) % forestIds.length;
            sessions[0] = null;
        }
    }

    protected Session getSession(String forestId, TransactionMode mode) {
        Session session = null;
        ContentSource cs = forestSourceMap.get(forestId);
        if (fastLoad) {
            session = cs.newSession(forestId);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Connect to forest " + forestId + " on " + session.getConnectionUri().getHost());
            }
        } else {
            session = cs.newSession();
            if (LOG.isDebugEnabled()) {
                LOG.debug("Connect to " + session.getConnectionUri().getHost());
            }
        }
        session.setTransactionMode(mode);
        return session;
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        if (batchSize > 1) {
            int len, sid;
            if (countBased) {
                len = 1;
                sid = sfId;
            } else {
                len = fastLoad ? forestIds.length : 1;
                sid = 0;
            }
            if (isCopyProps) {
                for (int i = 0; i < len; i++, sid++) {
                    if (counts[i] > 0) {
                        Content[] remainder = new Content[counts[i]];
                        System.arraycopy(forestContents[i], 0, remainder, 0, counts[i]);
                        if (sessions[sid] == null) {
                            String forestId = forestIds[i];
                            sessions[sid] = getSession(forestId);
                        }
                        insertBatch(remainder, sid);
                        stmtCounts[sid]++;
                        if (!isCopyProps) {
                            continue;
                        }
                        for (int j = 0; j < counts[i]; j++) {
                            DocumentMetadata m = metadatas[i][j].getMeta();
                            String u = metadatas[i][j].getUri();
                            if (m != null && m.getProperties() != null) {
                                setDocumentProperties(u, m.getProperties(), null, null, null, sessions[sid]);
                                stmtCounts[sid]++;
                            }
                        }
                    }
                }
            }
        }
        for (int i = 0; i < sessions.length; i++) {
            if (sessions[i] != null) {
                if (stmtCounts[i] > 0 && needCommit) {
                    try {
                        sessions[i].commit();
                        succeeded += commitUris[i].size();
                    } catch (RequestException e) {
                        // log error and continue on RequestServerException
                        LOG.error("Error commiting transaction", e);
                        failed += commitUris[i].size();
                        for (DocumentURI failedUri : commitUris[i]) {
                            LOG.warn("Failed document " + failedUri);
                        }
                        commitUris[i].clear();
                    } finally {
                        sessions[i].close();
                    }
                } else {
                    sessions[i].close();
                }
            }
        }
        context.getCounter(MarkLogicCounter.OUTPUT_RECORDS_COMMITTED).increment(succeeded);
        context.getCounter(MarkLogicCounter.OUTPUT_RECORDS_FAILED).increment(failed);
    }

    /**
     * 
     * @param uri
     *            uri of the document whose property is to be set
     * @param xmlString
     *            property in xml string
     * @param forestId
     * @throws RequestException
     */
    protected void setDocumentProperties(String uri, String xmlString, String permString, String collString,
            String quality, Session s) {
        String query = XQUERY_VERSION_1_0_ML + "declare variable $URI as xs:string external;\n"
                + "declare variable $XML-STRING as xs:string external;\n"
                + "declare variable $PERM-STRING as xs:string external;\n"
                + "declare variable $COLL-STRING as xs:string external;\n"
                + "declare variable $QUALITY-STRING as xs:string external;\n"
                + "xdmp:document-set-properties($URI,\n" + "  xdmp:unquote($XML-STRING)/prop:properties/node() )\n"
                + ", if('' eq ($PERM-STRING)) then () else \n" + "xdmp:document-set-permissions($URI, \n"
                + "xdmp:unquote($PERM-STRING)/node()/sec:permission)\n"
                + ", if('' eq ($COLL-STRING)) then () else \n"
                + "let $f := fn:function-lookup(xs:QName('xdmp:from-json-string'), 1)\n"
                + "return if (fn:exists($f)) then \n"
                + "xdmp:document-set-collections($URI,json:array-values($f($COLL-STRING)))\n"
                + "else xdmp:document-set-collections($URI,json:array-values(xdmp:from-json($COLL-STRING)))\n"
                + ", if('' eq ($QUALITY-STRING)) then () else xdmp:document-set-quality($URI,xs:integer($QUALITY-STRING))\n";
        AdhocQuery req = s.newAdhocQuery(query);
        req.setNewStringVariable("URI", uri);
        req.setNewStringVariable("XML-STRING", xmlString);
        req.setNewStringVariable("PERM-STRING", permString == null ? "" : permString);
        req.setNewStringVariable("COLL-STRING", collString == null || collString.isEmpty() ? "" : collString);
        req.setNewStringVariable("QUALITY-STRING", quality == null ? "" : quality);
        try {
            s.submitRequest(req);
        } catch (RequestException ex) {
            LOG.error("Error setting document properties for " + uri, ex);
        }
    }
}

class URIMetadata {
    String uri;
    DocumentMetadata meta;

    public URIMetadata(String uri, DocumentMetadata meta) {
        super();
        this.uri = uri;
        this.meta = meta;
    }

    public String getUri() {
        return uri;
    }

    public DocumentMetadata getMeta() {
        return meta;
    }

}