com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchHadoopUtils.java Source code

Java tutorial

Introduction

Here is the source code for com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchHadoopUtils.java

Source

/*******************************************************************************
 * Copyright 2012 The Infinit.e Open Source Project
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.infinit.e.processing.custom.utils;

import java.text.ParseException;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.bson.types.ObjectId;
import org.elasticsearch.cluster.metadata.IndexMetaData;

import com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat;
import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo;
import com.ikanow.infinit.e.data_model.utils.ThreadSafeSimpleDateFormat;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

/**
 * Helpers that translate the "advanced configuration" of a custom map/reduce job into the
 * Hadoop configuration properties read by the Elasticsearch input format
 * ({@code es.query}, {@code es.resource}, ...).
 */
public class InfiniteElasticsearchHadoopUtils {

    /** Matches the {@code yyyy.MM.dd} date stamp looked for inside index names. */
    private static final Pattern INDEX_DATE_REGEX = Pattern.compile("[0-9]{4}[.][0-9]{2}[.][0-9]{2}");

    /** Number of milliseconds covered by one dated index. */
    private static final long MILLIS_PER_DAY = 24L * 3600L * 1000L;

    /**
     * Configures {@code config} so that the job reads its input from Elasticsearch.
     * <p>
     * The following shortcut keys are read from <em>and removed from</em>
     * {@code advancedConfigurationDbo}:
     * <ul>
     * <li>{@code $types} - list, or comma-separated string, of type names to read</li>
     * <li>{@code $tmin} / {@code $tmax} - optional lower/upper time bounds</li>
     * <li>{@code $streaming} - optional Boolean selecting which index name pattern is used</li>
     * </ul>
     * The keys {@code query}, {@code filter}, {@code fields} and {@code size} are read but left in place.
     * <p>
     * On success {@code es.resource} and {@code es.index.read.missing.as.empty} are always set,
     * {@code es.query} is set when a query was supplied, and the HTTP proxy properties are set when
     * {@link InfiniteEsInputFormat#LOCAL_DEBUG_MODE} is enabled.
     *
     * @param job the job being configured; its {@code communityIds} select the indexes to read
     * @param config the Hadoop configuration to populate
     * @param advancedConfigurationDbo the user-supplied configuration object (mutated, see above)
     * @throws RuntimeException if {@code $tmin}/{@code $tmax} is combined with a URL (String) query
     *         or with an explicit {@code filter}, or if no index/type could be resolved for any community
     */
    public static void handleElasticsearchInput(CustomMapReduceJobPojo job, Configuration config,
            BasicDBObject advancedConfigurationDbo) {
        // Pull out type list:
        String[] types = extractTypes(advancedConfigurationDbo.remove("$types"));

        //QUERY:

        // Date override (both keys are removed before either is parsed):
        Date fromOverride = null;
        Date toOverride = null;
        Object fromOverrideObj = advancedConfigurationDbo.remove("$tmin");
        Object toOverrideObj = advancedConfigurationDbo.remove("$tmax");
        if (null != fromOverrideObj) {
            fromOverride = InfiniteHadoopUtils.dateStringFromObject(fromOverrideObj, true);
        }
        if (null != toOverrideObj) {
            toOverride = InfiniteHadoopUtils.dateStringFromObject(toOverrideObj, false);
        }
        boolean dateFiltered = (null != fromOverride) || (null != toOverride);

        // Only a genuine Boolean is honoured; anything else means "not specified"
        Boolean streaming = null;
        Object streamingObj = advancedConfigurationDbo.remove("$streaming");
        if (streamingObj instanceof Boolean) {
            streaming = (Boolean) streamingObj;
        }

        Object queryObj = advancedConfigurationDbo.get("query");
        if (queryObj instanceof String) { // (URL version)
            // Validate before touching config, so a rejected request leaves no partial state behind
            if (dateFiltered) {
                throw new RuntimeException(
                        "Can't specify $tmin/$tmax shortcut in conjunction with 'URL' query type");
            }
            config.set("es.query", queryObj.toString());
        } else if (null != queryObj) {
            BasicDBObject newQuery = buildQuery(advancedConfigurationDbo, queryObj, fromOverride, toOverride);
            config.set("es.query", newQuery.toString());
        }
        //(else no query == match all)
        // NOTE(review): when no "query" is supplied, $tmin/$tmax are only used below to drop whole
        // indexes by their date stamp; no @timestamp range filter is sent - confirm this is intended.

        //COMMUNITIES

        ThreadSafeSimpleDateFormat tssdf = null;
        if (dateFiltered) {
            tssdf = new ThreadSafeSimpleDateFormat("yyyy.MM.dd");
        }

        StringBuilder overallIndexNames = new StringBuilder();
        for (ObjectId commId : job.communityIds) {
            String communityResource = buildCommunityResource(commId, streaming, types, fromOverride,
                    toOverride, tssdf);
            if (null == communityResource) {
                continue; // nothing to read for this community
            }
            if (overallIndexNames.length() > 0) {
                overallIndexNames.append(",,"); // (per-community resources are separated by a double comma)
            }
            overallIndexNames.append(communityResource);
        } //(end loop over community)

        if (0 == overallIndexNames.length()) {
            throw new RuntimeException(
                    "Communities contained no types, either all indexes empty, or index is corrupt");
        }

        config.set("es.resource", overallIndexNames.toString());
        config.set("es.index.read.missing.as.empty", "yes");

        //proxy if running in debug mode:
        if (InfiniteEsInputFormat.LOCAL_DEBUG_MODE) {
            config.set("es.net.proxy.http.host", "localhost");
            config.set("es.net.proxy.http.port", "8888");
        }
    }

    /**
     * Converts the raw {@code $types} value into an array of type names.
     *
     * @param typesObj a {@link BasicDBList}, a comma-separated String, or anything else
     * @return the type names, or {@code null} if {@code typesObj} is {@code null} or of an unsupported type
     */
    private static String[] extractTypes(Object typesObj) {
        if (typesObj instanceof BasicDBList) {
            // Elements are converted individually: toArray(new String[0]) would throw
            // ArrayStoreException as soon as the list held a single non-String value.
            BasicDBList typeList = (BasicDBList) typesObj;
            int nonNullCount = 0;
            for (Object element : typeList) {
                if (null != element) {
                    nonNullCount++;
                }
            }
            String[] types = new String[nonNullCount];
            int next = 0;
            for (Object element : typeList) {
                if (null != element) {
                    types[next++] = element.toString();
                }
            }
            return types;
        }
        if (typesObj instanceof String) {
            // trim first so leading/trailing blanks do not end up inside the first/last type name
            return ((String) typesObj).trim().split("\\s*,\\s*");
        }
        return null;
    }

    /**
     * Builds the JSON query object for a non-URL query, copying across the optional
     * {@code filter}/{@code fields}/{@code size} entries and adding an {@code @timestamp} range
     * filter when a date override is present.
     *
     * @throws RuntimeException if a date override is combined with an explicit {@code filter}
     */
    private static BasicDBObject buildQuery(BasicDBObject advancedConfigurationDbo, Object queryObj,
            Date fromOverride, Date toOverride) {
        BasicDBObject newQuery = new BasicDBObject();
        newQuery.put("query", queryObj);

        // Optional pass-through elements (simply omitted when absent)
        Object filterObj = advancedConfigurationDbo.get("filter");
        if (null != filterObj) {
            newQuery.put("filter", filterObj);
        }
        Object fieldsObj = advancedConfigurationDbo.get("fields");
        if (null != fieldsObj) {
            newQuery.put("fields", fieldsObj);
        }
        Object sizeObj = advancedConfigurationDbo.get("size");
        if (null != sizeObj) {
            newQuery.put("size", sizeObj);
        }

        if ((null != fromOverride) || (null != toOverride)) {
            if (null != filterObj) { // (combining with a user filter is not supported)
                throw new RuntimeException(
                        "Can't (currently) specify $tmin/$tmax shortcut in conjunction with filter");
            }
            BasicDBObject filterRangeParamsDbo = new BasicDBObject();
            if (null != fromOverride) {
                filterRangeParamsDbo.put("gte", fromOverride.getTime());
            }
            if (null != toOverride) {
                filterRangeParamsDbo.put("lte", toOverride.getTime());
            }
            BasicDBObject filterRangeDbo = new BasicDBObject("@timestamp", filterRangeParamsDbo);
            newQuery.put("filter", new BasicDBObject("range", filterRangeDbo));
        }
        return newQuery;
    }

    /**
     * Builds the {@code indexes/type1,type2,...} resource string for a single community.
     *
     * @param commId the community whose record indexes are wanted
     * @param streaming {@code null} selects {@code recs_*<id>*}, {@code true} selects
     *        {@code recs_t_<id>*}, {@code false} selects {@code recs_<id>}
     * @param types explicit type names, or {@code null} to collect them from the index mappings
     * @param fromOverride optional lower time bound (may be {@code null})
     * @param toOverride optional upper time bound (may be {@code null})
     * @param tssdf parser for index date stamps; only used when a time bound is present
     * @return the resource string, or {@code null} if no index or no type remains for this community
     */
    private static String buildCommunityResource(ObjectId commId, Boolean streaming, String[] types,
            Date fromOverride, Date toOverride, ThreadSafeSimpleDateFormat tssdf) {
        StringBuilder indexNames = new StringBuilder();
        //TODO (INF-2641): need to handle:
        //c) anyway to sub-query?! (look for communityIds term?!)

        if (null == streaming) {
            indexNames.append("recs_*").append(commId.toString()).append("*");
        } else if (streaming) {
            indexNames.append("recs_t_").append(commId.toString()).append("*");
        } else { // !streaming
            indexNames.append("recs_").append(commId.toString());
        }

        boolean dateFiltered = (null != fromOverride) || (null != toOverride);

        HashSet<String> typesAdded = new HashSet<String>();
        if (null != types) {
            for (String type : types) {
                typesAdded.add(type);
            }
        }

        // With manual types and no date filtering the wildcard index expression can be used as is.
        // Otherwise the concrete indexes are inspected, to minimize the number of es types that get
        // exposed (they are badly behaved in terms of backwards compatibility) and to drop indexes
        // outside the requested date range.
        if ((null == types) || dateFiltered) {
            StringBuilder decomposedIndexes = new StringBuilder();
            boolean needDecomposedIndexes = false;

            ElasticSearchManager indexMgr = ElasticSearchManager.getIndex("doc_dummy"); // (index guaranteed to exist)
            Object[] indexMetaObj = indexMgr.getRawClient().admin().cluster().prepareState()
                    .setIndices(indexNames.toString()).setRoutingTable(false).setNodes(false)
                    .setListenerThreaded(false).get().getState().getMetaData().getIndices().values().toArray();

            if (null != indexMetaObj) {
                for (Object oo : indexMetaObj) {
                    IndexMetaData indexMeta = (IndexMetaData) oo;
                    String indexName = indexMeta.getIndex();

                    if (dateFiltered && !indexOverlapsDateRange(indexName, fromOverride, toOverride, tssdf)) {
                        // an index was filtered out, so the wildcard expression can no longer be used
                        needDecomposedIndexes = true;
                        continue;
                    }

                    if (null == types) {
                        Iterator<String> typesIt = indexMeta.getMappings().keysIt();
                        while (typesIt.hasNext()) {
                            String type = typesIt.next();
                            if (!type.equals("_default_")) {
                                typesAdded.add(type);
                            }
                        }
                    }
                    if (0 != decomposedIndexes.length()) {
                        decomposedIndexes.append(',');
                    }
                    decomposedIndexes.append(indexName);
                } //(end loop over indexes)
            }

            if (needDecomposedIndexes) { // (because we filtered some indexes out)
                indexNames = decomposedIndexes;
            }
        }

        if (0 == indexNames.length()) {
            return null; // nothing to do here...
        }
        if (typesAdded.isEmpty()) {
            return null; // no types associated with this set of indexes (ie don't add)
        }

        // "<indexes>/<type>,<type>,..."
        char separator = '/';
        for (String type : typesAdded) {
            indexNames.append(separator).append(type);
            separator = ',';
        }
        return indexNames.toString();
    }

    /**
     * Decides whether an index should be kept given the requested date range.
     * <p>
     * An index whose name contains no {@code yyyy.MM.dd} stamp is always kept. A stamped index is
     * treated as covering the 24 hours starting at the parsed date, and is kept only if that span
     * overlaps {@code [fromOverride, toOverride]}. A stamp that fails to parse causes the index to
     * be dropped.
     *
     * @return {@code true} if the index should be read, {@code false} if it should be skipped
     */
    private static boolean indexOverlapsDateRange(String indexName, Date fromOverride, Date toOverride,
            ThreadSafeSimpleDateFormat tssdf) {
        Matcher m = INDEX_DATE_REGEX.matcher(indexName);
        if (!m.find()) {
            return true;
        }
        try {
            Date indexStart = tssdf.parse(m.group());
            long indexEnd = indexStart.getTime() + MILLIS_PER_DAY - 1;

            if ((null != fromOverride) && (indexEnd < fromOverride.getTime())) {
                return false; // no overlap on the left
            }
            if ((null != toOverride) && (indexStart.getTime() > toOverride.getTime())) {
                return false; // no overlap on the right
            }
            return true;
        } catch (ParseException e) {
            // odd index name, it happens - the index is left out
            return false;
        }
    }

}