com.marklogic.mapreduce.examples.ContentReader.java Source code

Java tutorial

Introduction

Here is the source code for com.marklogic.mapreduce.examples.ContentReader.java:

Source

/*
 * Copyright 2003-2016 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.mapreduce.examples;

import java.io.IOException;
import java.net.URI;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;

import javax.net.ssl.KeyManager;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import com.marklogic.mapreduce.ContentType;
import com.marklogic.mapreduce.DocumentInputFormat;
import com.marklogic.mapreduce.DocumentURI;
import com.marklogic.mapreduce.MarkLogicConstants;
import com.marklogic.mapreduce.DatabaseDocument;
import com.marklogic.mapreduce.SslConfigOptions;

/**
 * Read documents from MarkLogic Server using an SSL-enabled connection and 
 * write them out to HDFS.  Used with config file 
 * conf/marklogic-docin-textout.xml.
 */
public class ContentReader {
    /**
     * Pass-through mapper: forwards each (URI, document) pair unchanged and
     * logs an error for any pair that has a null key or a null value.
     */
    public static class DocMapper extends Mapper<DocumentURI, DatabaseDocument, DocumentURI, DatabaseDocument> {
        public static final Log LOG = LogFactory.getLog(DocMapper.class);

        /**
         * Emits the incoming pair as-is when both parts are present.
         *
         * @param key     URI of the document being read; may be null
         * @param value   content of the document; may be null
         * @param context map context that receives the output pair
         */
        public void map(DocumentURI key, DatabaseDocument value, Context context)
                throws IOException, InterruptedException {
            final boolean incomplete = (key == null) || (value == null);
            if (incomplete) {
                // Nothing usable to emit; record what was received and move on.
                LOG.error("key: " + key + ", value: " + value);
                return;
            }
            context.write(key, value);
        }
    }

    /**
     * Configures and runs the "content reader" job.
     *
     * <p>After generic Hadoop options have been removed, two arguments are
     * expected: the job configuration resource and the output directory.
     * The process exits with status 2 on a usage error, 0 when the job
     * succeeds and 1 when it fails.
     *
     * @param args command line: [generic options] configFile outputDir
     * @throws Exception if the job cannot be set up or run
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Strip generic Hadoop options (-D, -conf, ...) BEFORE counting, so the
        // usage check looks at the arguments that are actually indexed below.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: ContentReader configFile outputDir");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "content reader");
        job.setJarByClass(ContentReader.class);
        job.setInputFormatClass(DocumentInputFormat.class);
        job.setMapperClass(DocMapper.class);
        job.setMapOutputKeyClass(DocumentURI.class);
        job.setMapOutputValueClass(DatabaseDocument.class);
        job.setOutputFormatClass(CustomOutputFormat.class);

        CustomOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        // From here on work with the job's own configuration object, so the
        // settings below are the ones the job actually sees.
        conf = job.getConfiguration();
        conf.addResource(otherArgs[0]);
        conf.setClass(MarkLogicConstants.INPUT_SSL_OPTIONS_CLASS, SslOptions.class, SslConfigOptions.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * SSL settings handed to the connector through
     * {@link MarkLogicConstants#INPUT_SSL_OPTIONS_CLASS}.
     *
     * <p>WARNING: the SSL context built here accepts every certificate and
     * therefore provides no protection against an impersonated server. That
     * is acceptable for a demo only.
     */
    static class SslOptions implements SslConfigOptions {
        /**
         * @return the cipher suites to enable on the connection
         */
        @Override
        public String[] getEnabledCipherSuites() {
            // NOTE(review): these are legacy CBC/SHA-1 suites - confirm they
            // are still what the target server is configured to accept.
            return new String[] { "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
                    "TLS_RSA_WITH_AES_256_CBC_SHA" };
        }

        /**
         * @return the protocol versions to enable on the connection
         */
        @Override
        public String[] getEnabledProtocols() {
            // NOTE(review): recent JDKs disable TLSv1 by default; consider
            // moving to a newer protocol once the server side is confirmed.
            return new String[] { "TLSv1" };
        }

        /**
         * Builds a TLSv1 context whose trust manager accepts any peer.
         *
         * @return an initialised SSL context, never null
         * @throws IllegalStateException if the context cannot be created or
         *         initialised; the underlying exception is kept as the cause
         */
        @Override
        public SSLContext getSslContext() {
            // Trust anyone.
            TrustManager[] trustManagers = new TrustManager[] { new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] x509Certificates, String s)
                        throws CertificateException {
                    // nothing to do
                }

                @Override
                public void checkServerTrusted(X509Certificate[] x509Certificates, String s)
                        throws CertificateException {
                    // nothing to do
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    // The X509TrustManager contract asks for a non-null,
                    // possibly empty, array rather than null.
                    return new X509Certificate[0];
                }
            } };

            // No client certificate is presented.
            KeyManager[] keyManagers = null;
            try {
                SSLContext sslContext = SSLContext.getInstance("TLSv1");
                sslContext.init(keyManagers, trustManagers, null);
                return sslContext;
            } catch (NoSuchAlgorithmException e) {
                // Fail here with the real cause instead of continuing with a
                // null context and hitting a NullPointerException later.
                throw new IllegalStateException("TLSv1 SSL context is not available", e);
            } catch (KeyManagementException e) {
                // An uninitialised context is unusable; do not hand it out.
                throw new IllegalStateException("Unable to initialise TLSv1 SSL context", e);
            }
        }
    }

    /**
     * Output format whose record writer is a {@link CustomWriter} bound to
     * the job's output path and configuration.
     */
    static class CustomOutputFormat extends FileOutputFormat<DocumentURI, DatabaseDocument> {

        /**
         * Creates the writer for one task attempt.
         *
         * @param context the task attempt the writer is created for
         * @return a new {@link CustomWriter}
         */
        @Override
        public RecordWriter<DocumentURI, DatabaseDocument> getRecordWriter(TaskAttemptContext context)
                throws IOException, InterruptedException {
            final Path outputDir = getOutputPath(context);
            final Configuration taskConf = context.getConfiguration();
            return new CustomWriter(outputDir, taskConf);
        }
    }

    /**
     * Record writer that stores every document in its own file inside the
     * output directory. The file is named after the last path segment of the
     * document URI.
     *
     * <p>Writing is best-effort: a document that cannot be written is reported
     * on standard error and skipped, and the remaining records are still
     * processed.
     */
    static class CustomWriter extends RecordWriter<DocumentURI, DatabaseDocument> {

        Path dir;
        Configuration conf;
        FileSystem fs;

        /**
         * @param path output directory that receives the document files
         * @param conf configuration used to resolve the file system of {@code path}
         * @throws IllegalStateException if the file system for {@code path}
         *         cannot be obtained; the I/O error is kept as the cause
         */
        public CustomWriter(Path path, Configuration conf) {
            dir = path;
            this.conf = conf;
            try {
                fs = path.getFileSystem(conf);
            } catch (IOException ex) {
                // Without a file system no record can ever be written, so fail
                // now instead of leaving fs null and failing on every record.
                throw new IllegalStateException("Unable to access file system for output path: " + path, ex);
            }
        }

        /**
         * Nothing to release: every stream opened by
         * {@link #write(DocumentURI, DatabaseDocument)} is closed there.
         */
        @Override
        public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
        }

        /**
         * Writes one document to {@code <output dir>/<last URI path segment>}.
         * Binary content is written as raw bytes; any other content is written
         * as the UTF-8 bytes held by its {@link Text} form. An existing file is
         * never overwritten.
         *
         * @param key     URI of the document; its last path segment is the file name
         * @param content document to store
         */
        @Override
        public void write(DocumentURI key, DatabaseDocument content) throws IOException, InterruptedException {
            Path path = null;
            try {
                String pathname = new URI(key.getUri()).getPath();
                // getPath() is null for opaque URIs; the last segment is empty
                // when the URI ends with '/'. Neither yields a usable name.
                String filename = pathname == null ? "" : pathname.substring(pathname.lastIndexOf('/') + 1);
                if (filename.isEmpty()) {
                    throw new IOException("Cannot derive a file name from document URI: " + key.getUri());
                }
                // Resolve against the full output directory, not just its last
                // component, so files land where the job was told to write.
                path = new Path(dir, filename);

                // try-with-resources closes the stream on every path, including
                // the text branch and failures part-way through a write.
                try (FSDataOutputStream out = fs.create(path, false)) {
                    System.out.println("writing to: " + path);
                    if (content.getContentType() == ContentType.BINARY) {
                        byte[] byteArray = content.getContentAsByteArray();
                        out.write(byteArray, 0, byteArray.length);
                    } else {
                        Text text = content.getContentAsText();
                        // Text holds UTF-8 bytes; only the first getLength()
                        // bytes of the backing array are valid. writeChars()
                        // would emit two bytes per char (UTF-16) instead.
                        out.write(text.getBytes(), 0, text.getLength());
                    }
                }
            } catch (Exception ex) {
                // Deliberately best-effort: report the failure and carry on
                // with the next record rather than failing the whole task.
                System.err.println("Failed to create or write to file: " + path);
                ex.printStackTrace();
            }
        }
    }
}