com.ikanow.infinit.e.processing.custom.utils.HadoopUtils.java Source code

Introduction

Here is the source code for com.ikanow.infinit.e.processing.custom.utils.HadoopUtils.java, a utility class that converts the output of custom Hadoop map/reduce jobs (plain text and sequence files, including several Mahout writables) into MongoDB BSON structures, and that resolves the HDFS paths and Hadoop configuration for those jobs.

Source

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.processing.custom.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Iterator;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.bson.types.ObjectId;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import com.ikanow.infinit.e.data_model.store.MongoDbUtil;
import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.hadoop.io.BSONWritable;

public class HadoopUtils {

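    /**
     * Deletes the completed HDFS output directory of the given custom job, if it exists.
     */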
    public static void deleteHadoopDir(CustomMapReduceJobPojo cmr)
            throws SAXException, IOException, ParserConfigurationException {
        PropertiesManager props = new PropertiesManager();
        Configuration conf = getConfiguration(props);
        Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(pathDir)) {
            fs.delete(pathDir, true);
        }
    }

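    /**
     * Reads the plain text output files ("part-*") of a custom job and converts each
     * tab-separated line into a BSON object with "key" and "value" fields.
     * @param nLimit maximum number of records to return (0 or negative means no limit)
     * @param fields not applicable to plain text output, ignored
     */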
    public static BasicDBList getBsonFromTextFiles(CustomMapReduceJobPojo cmr, int nLimit, String fields)
            throws IOException, SAXException, ParserConfigurationException {

        BasicDBList dbl = new BasicDBList();

        PropertiesManager props = new PropertiesManager();
        Configuration conf = getConfiguration(props);

        Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false);
        FileSystem fs = FileSystem.get(conf);

        FileStatus[] files = fs.globStatus(new Path(pathDir.toString() + "/part-*"));
        int nRecords = 0;
        for (FileStatus file : files) {
            if (file.getLen() > 0) {
                FSDataInputStream in = fs.open(file.getPath());
                BufferedReader bin = new BufferedReader(new InputStreamReader(in));
                try {
                    for (;;) {
                        String s = bin.readLine();
                        if (null == s)
                            break;

                        // Each line is "key<TAB>value"; lines with no tab are treated as value-only
                        String[] keyValue = s.split("\t", 2);
                        BasicDBObject dbo = new BasicDBObject();
                        if (keyValue.length > 1) {
                            dbo.put("key", keyValue[0]);
                            dbo.put("value", keyValue[1]);
                        } else {
                            dbo.put("value", keyValue[0]);
                        }
                        dbl.add(dbo);
                        nRecords++;
                        if ((nLimit > 0) && (nRecords >= nLimit)) {
                            return dbl; // reached the requested limit
                        }
                    }
                } finally {
                    bin.close(); // also closes the underlying FSDataInputStream
                }
            }
        }
        return dbl;
    }//TESTED

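    /**
     * Reads the sequence file output of a custom job and converts each key/value record
     * into a BSON object. Handles the standard Hadoop writables (Text, IntWritable,
     * LongWritable, DoubleWritable), BSONWritable, and the Mahout vector and cluster
     * writables.
     * @param nLimit maximum number of records to return (0 or negative means no limit)
     * @param fields comma-separated list of fields to remove (top level, or one level
     *               of nesting under "value")
     */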
    public static BasicDBList getBsonFromSequenceFile(CustomMapReduceJobPojo cmr, int nLimit, String fields)
            throws SAXException, IOException, ParserConfigurationException {

        BasicDBList dbl = new BasicDBList();

        PropertiesManager props = new PropertiesManager();
        Configuration conf = getConfiguration(props);

        Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false);

        @SuppressWarnings({ "unchecked", "rawtypes" })
        SequenceFileDirIterable<? extends Writable, ? extends Writable> seqFileDir = new SequenceFileDirIterable(
                pathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf);

        // Very basic field handling: only top-level fields, one level of nesting under "value", and removal only
        HashSet<String> fieldLookup = null;
        if (null != fields) {
            fieldLookup = new HashSet<String>();
            String[] fieldArray = fields.split(",");
            for (String field : fieldArray) {
                String[] fieldDecomp = field.split(":");
                fieldLookup.add(fieldDecomp[0]);
            }
        } //TOTEST

        int nRecords = 0;
        for (Pair<? extends Writable, ? extends Writable> record : seqFileDir) {
            BasicDBObject element = new BasicDBObject();

            // KEY

            Writable key = record.getFirst();
            if (key instanceof org.apache.hadoop.io.Text) {
                org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) key;
                element.put("key", writable.toString());
            } else if (key instanceof org.apache.hadoop.io.DoubleWritable) {
                org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) key;
                element.put("key", Double.toString(writable.get()));
            } else if (key instanceof org.apache.hadoop.io.IntWritable) {
                org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) key;
                element.put("key", Integer.toString(writable.get()));
            } else if (key instanceof org.apache.hadoop.io.LongWritable) {
                org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) key;
                element.put("key", Long.toString(writable.get()));
            } else if (key instanceof BSONWritable) {
                element.put("key", MongoDbUtil.convert((BSONWritable) key));
            }

            // VALUE

            Writable value = record.getSecond();
            if (value instanceof org.apache.hadoop.io.Text) {
                org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) value;
                element.put("value", writable.toString());
            } else if (value instanceof org.apache.hadoop.io.DoubleWritable) {
                org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) value;
                element.put("value", Double.toString(writable.get()));
            } else if (value instanceof org.apache.hadoop.io.IntWritable) {
                org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) value;
                element.put("value", Integer.toString(writable.get()));
            } else if (value instanceof org.apache.hadoop.io.LongWritable) {
                org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) value;
                element.put("value", Long.toString(writable.get()));
            } else if (value instanceof BSONWritable) {
                element.put("value", MongoDbUtil.convert((BSONWritable) value));
            } else if (value instanceof org.apache.mahout.math.VectorWritable) {
                Vector vec = ((org.apache.mahout.math.VectorWritable) value).get();
                BasicDBList dbl2 = listFromMahoutVector(vec, "value", element);
                element.put("value", dbl2);
            } else if (value instanceof org.apache.mahout.clustering.classify.WeightedVectorWritable) {
                org.apache.mahout.clustering.classify.WeightedVectorWritable vecW = (org.apache.mahout.clustering.classify.WeightedVectorWritable) value;
                element.put("valueWeight", vecW.getWeight());
                BasicDBList dbl2 = listFromMahoutVector(vecW.getVector(), "value", element);
                element.put("value", dbl2);
            } else if (value instanceof org.apache.mahout.clustering.iterator.ClusterWritable) {
                Cluster cluster = ((org.apache.mahout.clustering.iterator.ClusterWritable) value).getValue();
                BasicDBObject clusterVal = new BasicDBObject();
                clusterVal.put("center", listFromMahoutVector(cluster.getCenter(), "center", clusterVal));
                clusterVal.put("radius", listFromMahoutVector(cluster.getRadius(), "radius", clusterVal));
                element.put("value", clusterVal);
            } else {
                element.put("unknownValue", value.getClass().toString());
            }

            // Check the fields settings:
            // Only a few cases are handled: removal of top-level fields, and removal of
            // fields one level of nesting down, under "value" (when the value is a nested object)
            if (null != fieldLookup) {
                for (String fieldToRemove : fieldLookup) {
                    if (fieldToRemove.startsWith("value.")) {
                        fieldToRemove = fieldToRemove.substring(6);
                        Object nested = element.get("value");
                        if (nested instanceof BasicDBObject) {
                            ((BasicDBObject) nested).remove(fieldToRemove);
                        }
                    } else {
                        element.remove(fieldToRemove);
                    }
                } //TOTEST
            }

            dbl.add(element);
            nRecords++;
            if ((nLimit > 0) && (nRecords >= nLimit)) {
                break;
            }
        }

        return dbl;
    }//TOTEST

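    /**
     * Converts a Mahout vector into a BSON list: dense vectors become a plain array of
     * doubles, sparse vectors a list of {"k": index, "v": value} objects. If the vector
     * is named, its name is also stored in the supplied element under prefix + "Name".
     */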
    private static BasicDBList listFromMahoutVector(Vector vec, String prefix, BasicDBObject element) {
        if (vec instanceof NamedVector) {
            element.put(prefix + "Name", ((NamedVector) vec).getName());
        }
        BasicDBList dbl2 = new BasicDBList();
        if (vec.isDense()) {
            int nSize = vec.size();
            dbl2.ensureCapacity(nSize);
            for (int i = 0; i < nSize; ++i) {
                dbl2.add(vec.getQuick(i));
            }
        } else { // sparse, write as a set in the format [{int:double}]
            Iterator<org.apache.mahout.math.Vector.Element> elIt = vec.iterateNonZero();
            while (elIt.hasNext()) {
                BasicDBObject el2 = new BasicDBObject();
                org.apache.mahout.math.Vector.Element el = elIt.next();
                el2.put("k", el.index());
                el2.put("v", el.get());
                dbl2.add(el2);
            }
        }
        return dbl2;
    }

    /**
     * Returns the HDFS path for the custom task
     * @param bTemp if true, returns the temporary "in_progress" location; otherwise the final "completed" location
     * @throws ParserConfigurationException 
     * @throws IOException 
     * @throws SAXException 
     */
    public static Path getPathForJob(CustomMapReduceJobPojo cmr, Configuration config, boolean bTemp)
            throws SAXException, IOException, ParserConfigurationException {
        // Build the name:
        StringBuilder sb = null;
        if (bTemp) {
            sb = new StringBuilder("in_progress/"); // (will be moved once the job completes)
        } else {
            sb = new StringBuilder("completed/"); // (final location)
        }
        for (ObjectId commId : cmr.communityIds) {
            sb.append(commId.toString()).append('_');
        }
        sb.append('/');
        sb.append(cmr.jobtitle).append('/');
        String pathName = sb.toString();

        return new Path(pathName);
    }//TOTEST

    public static Configuration getConfiguration(PropertiesManager prop_custom)
            throws SAXException, IOException, ParserConfigurationException {
        Configuration conf = new Configuration();
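        // (Note: "fs.default.name" is the Hadoop 1.x key; Hadoop 2.x deprecates it in favor of "fs.defaultFS")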
        if (prop_custom.getHadoopLocalMode()) {
            conf.set("fs.default.name", "local");
        } else {
            String fsUrl = getXMLProperty(prop_custom.getHadoopConfigPath() + "/hadoop/core-site.xml",
                    "fs.default.name");
            conf.set("fs.default.name", fsUrl);
        }
        return conf;
    }//TOTEST

    /**
     * Parses the given XML file and returns the value of the requested property.
     * The XML is expected to be in the Hadoop configuration format:
     * {@code <configuration><property><name>some.prop.name</name><value>some.value</value></property></configuration>}
     * 
     * @param xmlFileLocation the path of the XML file to parse
     * @param propertyName the name of the property to look up
     * @return the property's value, or null if the property is not found
     * @throws SAXException
     * @throws IOException
     * @throws ParserConfigurationException
     */
    public static String getXMLProperty(String xmlFileLocation, String propertyName)
            throws SAXException, IOException, ParserConfigurationException {
        File configFile = new File(xmlFileLocation);

        DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
        Document doc = docBuilder.parse(configFile);
        doc.getDocumentElement().normalize();

        NodeList listOfProps = doc.getElementsByTagName("property");

        for (int i = 0; i < listOfProps.getLength(); i++) {
            Node prop = listOfProps.item(i);
            if (prop.getNodeType() == Node.ELEMENT_NODE) {
                Element propElement = (Element) prop;
                NodeList name = propElement.getElementsByTagName("name").item(0).getChildNodes();
                Node nameValue = (Node) name.item(0);
                String nameString = nameValue.getNodeValue().trim();

                //found the correct property
                if (nameString.equals(propertyName)) {
                    //return the value
                    NodeList value = propElement.getElementsByTagName("value").item(0).getChildNodes();
                    Node valueValue = (Node) value.item(0);
                    String valueString = valueValue.getNodeValue().trim();
                    return valueString;
                }
            }
        }
        return null;
    }//TESTED

}
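
Example usage

The sketch below shows one way the readers above might be driven from application code. It is illustrative only: the HadoopUtilsExample class, the job title, the community id, and the nLimit/fields arguments are assumptions made for this example (in the real platform the CustomMapReduceJobPojo is loaded from MongoDB), and communityIds is assumed to be a List<ObjectId>.

import java.util.Arrays;

import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo;
import com.ikanow.infinit.e.processing.custom.utils.HadoopUtils;
import com.mongodb.BasicDBList;

public class HadoopUtilsExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical job descriptor -- in practice this pojo is loaded from MongoDB
        CustomMapReduceJobPojo cmr = new CustomMapReduceJobPojo();
        cmr.jobtitle = "exampleJob"; // example value only
        cmr.communityIds = Arrays.asList(new ObjectId("4e3706c48d26852237078005")); // example id

        // Read at most 100 records of sequence file output, removing a
        // (hypothetical) nested "raw" field from each value
        BasicDBList results = HadoopUtils.getBsonFromSequenceFile(cmr, 100, "value.raw");
        System.out.println(results);

        // Clean up the job's HDFS output directory once the results have been consumed
        HadoopUtils.deleteHadoopDir(cmr);
    }
}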