com.mycompany.mavenpails2.PailMove.java Source code

Introduction

Here is the source code for com.mycompany.mavenpails2.PailMove.java, a batch-layer driver that ingests sensor records into a Pail-based master dataset, computes batch views with JCascalog, and exports the accelerometer view to ElephantDB.

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.mycompany.mavenpails2;

import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.tap.Tap;
import cascalog.ops.IdentityBuffer;
import cascalog.ops.RandLong;
import clojure.lang.Keyword;
import clojure.lang.PersistentStructMap;
import backtype.cascading.tap.PailTap;
import backtype.cascading.tap.PailTap.PailTapOptions;
import backtype.hadoop.pail.Pail;
import backtype.hadoop.pail.PailSpec;
import backtype.hadoop.pail.PailStructure;
import com.mycompany.mavenpails2.Data;
import com.mycompany.mavenpails2.DataUnit;
import com.twitter.maple.tap.StdoutTap;
import elephantdb.DomainSpec;
import elephantdb.generated.DomainNotFoundException;
import elephantdb.generated.DomainNotLoadedException;
import elephantdb.generated.HostsDownException;
import elephantdb.generated.keyval.ElephantDB;
import elephantdb.jcascalog.EDB;
import elephantdb.partition.HashModScheme;
import elephantdb.persistence.JavaBerkDB;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import jcascalog.Api;
import jcascalog.Fields;
import jcascalog.Option;
import jcascalog.Subquery;
import jcascalog.op.Count;
import jcascalog.op.LT;
import jcascalog.op.GT;
import jcascalog.op.Sum;
import jcascalog.op.Max;
import jcascalog.op.Avg;
import jcascalog.op.Equals;
import jcascalog.op.Min;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.thrift.TException;

/**
 * Batch-layer pipeline: ingests new sensor records into a Pail-based master
 * dataset, computes the batch views with JCascalog, and exports the
 * accelerometer view to ElephantDB.
 *
 * @author fedora
 */
public class PailMove {

    public static final String TEMP_DIR = "/tmp/swa";
    public static final String NEW_DATA_LOCATION = "/tmp/newData";
    public static final String MASTER_DATA_LOCATION = "/tmp/masterData";
    public static final String SNAPSHOT_LOCATION = "/tmp/swa/newDataSnapshot";
    public static final String SHREDDED_DATA_LOCATION = "/tmp/swa/shredded";

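    /**
     * Absorbs the update pail into the master pail and then consolidates the
     * master pail's underlying files.
     */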
    public static void mergeData(String masterDir, String updateDir) throws IOException {
        Pail target = new Pail(masterDir);
        Pail source = new Pail(updateDir);
        target.absorb(source);
        target.consolidate();
    }

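    /**
     * Registers the Thrift and Writable serializations with the JCascalog
     * application configuration and recreates the temporary working directory.
     */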
    public static void setApplicationConf() throws IOException {
        Map<String, String> conf = new HashMap<>();
        String sers = "backtype.hadoop.ThriftSerialization,org.apache.hadoop.io.serializer.WritableSerialization";
        conf.put("io.serializations", sers);
        Api.setApplicationConf(conf);

        FileSystem fs = FileSystem.get(new Configuration());
        fs.delete(new Path(TEMP_DIR), true);
        fs.mkdirs(new Path(TEMP_DIR));
        /* Configuration conf2 = new Configuration();
        FileSystem fs = FileSystem.get(conf2);
        fs.delete(new Path(TEMP_DIR), true);
        fs.mkdirs(new Path(TEMP_DIR)); */
    }

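    /**
     * Snapshots the new-data pail, shreds and appends the snapshot to the
     * master pail, and finally removes the ingested records from the
     * new-data pail.
     */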
    public static void ingest(Pail masterPail, Pail newDataPail) throws IOException {

        Pail snapshotPail = newDataPail.snapshot(SNAPSHOT_LOCATION);
        //shred();
        appendNewData(masterPail, snapshotPail);
        //consolidateAndAbsord(masterPail, new Pail(SHREDDED_DATA_LOCATION));
        newDataPail.deleteSnapshot(snapshotPail);
    }

    /* private static void consolidateAndAbsord(Pail masterPail, Pail shreddedPail) throws IOException {
    shreddedPail.consolidate();
    masterPail.absorb(shreddedPail);
    } */

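    /**
     * Builds a PailTap over the vertically partitioned master dataset that
     * reads only the given DataUnit fields.
     */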
    public static PailTap attributeTap(String path, final Data._Fields... fields) {
        PailTap.PailTapOptions opts = new PailTap.PailTapOptions();
        opts.attrs = new List[] { new ArrayList<String>() {
            {
                for (Data._Fields field : fields) {
                    add("" + field.getThriftFieldId());
                }
            }
        } };
        opts.spec = new PailSpec((PailStructure) new SplitDataPailStructure());
        return new PailTap(path, opts);
    }

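    /**
     * Builds a PailTap over a pail partitioned with SplitDataPailStructure.
     */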
    public static PailTap splitDataTap(String path) {
        PailTap.PailTapOptions opts = new PailTap.PailTapOptions();
        opts.spec = new PailSpec((PailStructure) new SplitDataPailStructure());
        return new PailTap(path, opts);
    }

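    /**
     * Builds a PailTap that deserializes Data records from an unpartitioned
     * pail (DataPailStructure).
     */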
    public static PailTap deserializeDataTap(String path) {
        PailTap.PailTapOptions opts = new PailTap.PailTapOptions();
        opts.spec = new PailSpec(new DataPailStructure());
        return new PailTap(path, opts);
    }

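    /**
     * Reads the snapshot, sprays its records across reducers using a random
     * key and an identity buffer, writes them into a pail partitioned by
     * SplitDataPailStructure, and consolidates the resulting small files.
     */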
    public static Pail shred() throws IOException {
        PailTap source = deserializeDataTap(SNAPSHOT_LOCATION);
        PailTap sink = splitDataTap(SHREDDED_DATA_LOCATION);
        Subquery reduced = new Subquery("?rand", "?data").predicate(source, "_", "?data-in")
                .predicate(new RandLong()).out("?rand").predicate(new IdentityBuffer(), "?data-in").out("?data");
        Api.execute(sink, new Subquery("?data").predicate(reduced, "_", "?data"));

        Pail shreddedPail = new Pail(SHREDDED_DATA_LOCATION);
        shreddedPail.consolidate();
        return shreddedPail;

    }

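    /**
     * Shreds the snapshotted new data and absorbs the resulting pail into
     * the master pail.
     */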
    public static void appendNewData(Pail masterPail, Pail snapshotPail) throws IOException {
        Pail shreddedPail = shred();
        masterPail.absorb(shreddedPail);
    }

    /*
    These joins are identical across all the batch views, so they are pulled
    out into their own method and computed only once.
    */
    public static Subquery FactsJoin() {
        PailTap masterData = splitDataTap("/tmp/masterData");
        Subquery a = new Subquery("?id", "?time", "?value").predicate(masterData, "_", "?data")
                .predicate(new ExtractValueFields(), "?data").out("?id", "?value", "?time");
        Subquery b = new Subquery("?id", "?tipo", "?time").predicate(masterData, "_", "?data")
                .predicate(new ExtractTypeField(), "?data").out("?id", "?tipo", "?time");
        Subquery x = new Subquery("?id", "?value", "?tipo", "?time").predicate(a, "?id", "?time", "?value")
                .predicate(b, "?id", "?tipo", "?time");
        return x;
    }

    /*
     Computes the batch view for the thermometers.
     */
    public static Subquery TempBatchView() {
        /*
         This query takes the previously computed batch join and calculates
         the thermometer batch view.
         */
        Object source = Api.hfsSeqfile("/tmp/joins");
        Subquery y = new Subquery("?id", "?value", "?tipo", "?time")
                .predicate(source, "?id", "?value", "?tipo", "?time")
                .predicate(new Equals(), "?tipo", "Termometro");
        Subquery z = new Subquery("?id", "?twenty", "?avg").predicate(y, "?id", "?value", "?tipo", "?time")
                .predicate(new ToTwentyBucket(), "?time").out("?twenty").predicate(new Avg(), "?value").out("?avg");

        return z;
    }

    /*
    Computes the batch view of anemometer averages.
    */
    public static Subquery AnemBatchView1() {
        Object source = Api.hfsSeqfile("/tmp/joins");
        Subquery y = new Subquery("?id", "?value", "?tipo", "?time")
                .predicate(source, "?id", "?value", "?tipo", "?time")
                .predicate(new Equals(), "?tipo", "Anemometro");
        Subquery z = new Subquery("?id", "?twenty", "?avg").predicate(y, "?id", "?value", "?tipo", "?time")
                .predicate(new ToTwentyBucket(), "?time").out("?twenty").predicate(new Avg(), "?value").out("?avg");
        return z;
    }

    /*
    Computes the batch view of anemometer peaks.
    */
    public static Subquery AnemBatchView2() {
        Object source = Api.hfsSeqfile("/tmp/joins");
        Subquery y = new Subquery("?id", "?value", "?tipo", "?time")
                .predicate(source, "?id", "?value", "?tipo", "?time")
                .predicate(new Equals(), "?tipo", "Anemometro");
        Subquery z = new Subquery("?id", "?ten", "?max").predicate(y, "?id", "?value", "?tipo", "?time")
                .predicate(new ToTenBucket(), "?time").out("?ten").predicate(new Max(), "?value").out("?max");
        return z;
    }

    /*
    Computes the batch views for the accelerometers.
    */
    public static Subquery AccelBatchView1() {
        Object source = Api.hfsSeqfile("/tmp/joins");
        Subquery y = new Subquery("?id", "?value", "?tipo", "?time")
                .predicate(source, "?id", "?value", "?tipo", "?time")
                .predicate(new Equals(), "?tipo", "Acelerometro");
        Subquery z = new Subquery("?id", "?twenty", "?absmax").predicate(y, "?id", "?value", "?tipo", "?time")
                .predicate(new ToTwentyBucket(), "?time").out("?twenty").predicate(new AbsMax(), "?value")
                .out("?absmax");
        Subquery a = new Subquery("?id", "?gran", "?bucket", "?absmax2").predicate(z, "?id", "?twenty", "?absmax")
                .predicate(new EmitGranularitiesTwenty(), "?twenty").out("?gran", "?bucket")
                .predicate(new AbsMax(), "?absmax").out("?absmax2");
        return a;
    }

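    /**
     * Prints every Data record stored in the master pail; useful for
     * debugging.
     */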
    public static void readPail() throws IOException {
        Pail<Data> datapail = new Pail<Data>("/tmp/masterData");
        for (Data d : datapail) {
            System.out.println(d.dataunit + " -> " + d.pedigree);
        }
    }

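    /**
     * Recursively deletes a local directory tree.
     */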
    public static void deleteFolder(File folder) {
        File[] files = folder.listFiles();
        if (files != null) { //some JVMs return null for empty dirs
            for (File f : files) {
                if (f.isDirectory()) {
                    deleteFolder(f);
                } else {
                    f.delete();
                }
            }
        }
        folder.delete();
    }

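    /**
     * Materializes the shared facts join into a sequence file at /tmp/joins
     * so that every batch view can read it.
     */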
    public static void prepWork() {
        Object source = Api.hfsSeqfile("/tmp/joins");
        //Api.execute(new StdoutTap(), FactsJoin());
        Api.execute(source, FactsJoin());
    }
    // FROM THIS POINT ON, SERVING LAYER

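    /**
     * Turns the accelerometer batch view into key/value pairs and writes
     * them to an ElephantDB domain with 32 shards backed by JavaBerkDB.
     */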
    public static void accelElephantDB(Subquery accel) {
        Subquery toEdb = new Subquery("?key", "?value")
                .predicate(accel, "?id", "?granularity", "?bucket", "?absmax")
                .predicate(new ToIdBucketedKey(), "?id", "?bucket").out("?key")
                .predicate(new ToSerializedInt(), "?absmax").out("?value");

        DomainSpec spec = new DomainSpec(new JavaBerkDB(), new HashModScheme(), 32);

        Object tap = EDB.makeKeyValTap("/tmp/outputs/acelerometros", spec);
        Api.execute(tap, toEdb);
    }

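    /**
     * Looks up a single key in an ElephantDB domain through the Thrift
     * client, logging any failure.
     */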
    public static void clientQuery(ElephantDB.Client client, String domain, ByteBuffer key) {
        try {
            client.get(domain, key);
        } catch (DomainNotFoundException | HostsDownException | DomainNotLoadedException | TException ex) {
            Logger.getLogger(PailMove.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    public static void main(String[] args) throws Exception {
        // The first steps set up the Hadoop configuration.
        setApplicationConf();
        LocalFileSystem fs = FileSystem.getLocal(new Configuration());
        /* Next we create the two pails needed to hold the master dataset.
        newDataPail holds the incoming records, which are later absorbed into
        masterPail, the master dataset itself.
        */
        Pail newDataPail;
        Pail masterPail;
        Path fils = new Path(NEW_DATA_LOCATION);
        if (!fs.exists(fils)) {
            newDataPail = Pail.create(FileSystem.get(new Configuration()), NEW_DATA_LOCATION,
                    new DataPailStructure());
        } else {
            newDataPail = new Pail<Data>(NEW_DATA_LOCATION);
        }
        if (!fs.exists(new Path(MASTER_DATA_LOCATION))) {
            masterPail = Pail.create(FileSystem.getLocal(new Configuration()), MASTER_DATA_LOCATION,
                    new SplitDataPailStructure());
        } else {
            masterPail = new Pail<Data>(MASTER_DATA_LOCATION);
        }

        /*
        The following routine reads a text file from the resources folder and
        breaks it apart to insert its records into newDataPail.
        */

        Pail.TypedRecordOutputStream out = newDataPail.openWrite();
        File file = new File(PailMove.class.getClassLoader().getResource("realdataset.txt").getFile());

        try (Scanner scanner = new Scanner(file)) {

            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                //result.append(line).append("\n");                       
                StringTokenizer tkn = new StringTokenizer(line);
                String sid = tkn.nextToken();
                String stime = tkn.nextToken();
                String stipo = tkn.nextToken();
                String seje1 = tkn.nextToken();
                String seje2 = tkn.nextToken();
                String selev = tkn.nextToken();
                String sx = tkn.nextToken();
                String sy = tkn.nextToken();
                String sz = tkn.nextToken();
                String svalue = tkn.nextToken();
                if (!tkn.hasMoreTokens()) {
                    long id = Long.parseLong(sid);
                    int time = Integer.parseInt(stime);
                    int tipo = Integer.parseInt(stipo);
                    int eje1 = Integer.parseInt(seje1);
                    int eje2 = Integer.parseInt(seje2);
                    int elev = Integer.parseInt(selev);
                    int posx = Integer.parseInt(sx);
                    int posy = Integer.parseInt(sy);
                    int posz = Integer.parseInt(sz);
                    double value = Double.parseDouble(svalue);

                    out.writeObject(GenerateData.setValue(id, time, value));
                    out.writeObject(GenerateData.setTipo(id, time, tipo));
                    out.writeObject(GenerateData.setPos(id, time, eje1, eje2, elev, posx, posy, posz));

                }
            }
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        // shred();

        /* ingest runs the sequence of steps that finally inserts the new data
        into the master dataset while avoiding duplicated or corrupted data.
        */
        ingest(masterPail, newDataPail);
        // prepWork builds the data join used to compute the batch views.
        prepWork();
        // Finally, the batch views are computed.

        //Object source = Api.hfsSeqfile("/tmp/accel");
        Api.execute(new StdoutTap(), TempBatchView());
        Api.execute(new StdoutTap(), AnemBatchView1());
        Api.execute(new StdoutTap(), AnemBatchView2());
        Api.execute(new StdoutTap(), AccelBatchView1());

        accelElephantDB(AccelBatchView1());
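        /* A hedged sketch (not part of the original pipeline) of how the
           serving layer could be queried once the ElephantDB domain written by
           accelElephantDB is loaded. The client construction is elided because
           it depends on the ElephantDB Thrift deployment; the domain name and
           key below are illustrative assumptions only.

           ElephantDB.Client client = ...; // Thrift client for the EDB ring
           clientQuery(client, "acelerometros", ByteBuffer.wrap(serializedKey));
        */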

        /* Delete the temporary joins directory.
           (There is probably a better way to do this.)
        */
        File index = new File("/tmp/joins");
        deleteFolder(index);

        /* CassandraDatastax client = new CassandraDatastax();
         client.connect("127.0.0.1");
            
         client.loadData();
         client.close(); */

    }

}