org.apache.pig.test.TestLocalPOSplit.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pig.test.TestLocalPOSplit.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pig.test;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;

import junit.framework.TestCase;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.ExecType;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.pen.LocalLogToPhyTranslationVisitor;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.LODefine;
import org.apache.pig.impl.logicalLayer.LOLoad;
import org.apache.pig.impl.logicalLayer.LogicalOperator;
import org.apache.pig.impl.logicalLayer.LogicalPlan;
import org.apache.pig.impl.logicalLayer.LogicalPlanBuilder;
import org.apache.pig.impl.logicalLayer.PlanSetter;
import org.apache.pig.impl.logicalLayer.validators.LogicalPlanValidationExecutor;
import org.apache.pig.impl.plan.CompilationMessageCollector;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.pen.util.FunctionalLogicalOptimizer;
import org.junit.Test;

public class TestLocalPOSplit extends TestCase {

    Random r = new Random();

    Log log = LogFactory.getLog(getClass());

    PigContext pigContext = new PigContext(ExecType.LOCAL, new Properties());

    @Test
    public void testSplit() throws IOException, VisitorException, ExecException {
        init();

        pigContext.connect();
        File datFile = File.createTempFile("tempA", ".dat");

        FileOutputStream dat = new FileOutputStream(datFile);

        for (int i = 0; i < 100; i++) {
            String str = r.nextInt(10) + "\n";
            dat.write(str.getBytes());

        }

        dat.close();

        String query = "split (load '" + Util.encodeEscape(datFile.getAbsolutePath())
                + "') into a if $0 == 2, b if $0 == 9, c if $0 == 7 ;";

        LogicalPlan plan = buildPlan(query);
        PhysicalPlan pp = buildPhysicalPlan(plan);

        DataBag[] bag = new DataBag[pp.getLeaves().size()];

        for (int i = 0; i < bag.length; i++) {
            bag[i] = BagFactory.getInstance().newDefaultBag();
        }

        for (int i = 0; i < pp.getLeaves().size(); i++) {
            Tuple t = null;
            for (Result res = pp.getLeaves().get(i).getNext(t); res.returnStatus != POStatus.STATUS_EOP; res = pp
                    .getLeaves().get(i).getNext(t)) {
                if (res.returnStatus == POStatus.STATUS_OK)
                    bag[i].add((Tuple) res.result);
            }
        }

        // Depending on how the "maps" in the physical plan are
        // built the leaves could be in different order between different runs.
        // lets test the first tuple out of each leaf to 
        // 1) ensure the value was not seen before
        // 2) all the remaining tuples from that leaf are same
        //    as the first value
        Map<DataByteArray, Boolean> seen = new HashMap<DataByteArray, Boolean>();
        seen.put(new DataByteArray("7".getBytes()), false);
        seen.put(new DataByteArray("9".getBytes()), false);
        seen.put(new DataByteArray("2".getBytes()), false);

        for (int i = 0; i < bag.length; i++) {
            DataByteArray firstValue = null;
            Iterator<Tuple> it = bag[i].iterator();
            if (it.hasNext()) {
                // check that we have not seen this value before
                Tuple t = it.next();
                System.out.println(t);
                firstValue = (DataByteArray) t.get(0);
                assertFalse((Boolean) seen.get(firstValue));
                seen.put(firstValue, true);

            }
            // check that all remaining tuples from this 
            // leaf have the same values as the first value
            for (; it.hasNext();) {
                Tuple t = it.next();
                System.out.println(t);
                assertEquals(t.get(0), firstValue);
            }
        }
    }

    @Test
    public void testSplitNulls() throws IOException, VisitorException, ExecException {
        init();

        pigContext.connect();
        File datFile = File.createTempFile("tempN1", ".dat");
        String path1 = Util.encodeEscape(datFile.getAbsolutePath());

        FileOutputStream dat = new FileOutputStream(datFile);
        String s1 = "1\n2\n3\n42\n4\n5\n";
        dat.write(s1.getBytes());
        dat.close();

        String s2 = "1\n2\n43\n3\n4\n5\n";

        datFile = File.createTempFile("tempN2", ".dat");
        String path2 = Util.encodeEscape(datFile.getAbsolutePath());

        dat = new FileOutputStream(datFile);
        dat.write(s2.getBytes());
        dat.close();

        String query = "a = load '" + path1 + "'; b = load '" + path2 + "'; "
                + "c = cogroup a by $0, b by $0; d = foreach c generate $0, flatten($1), flatten($2); "
                + "split d into e if 1==1, f if 1==1;";

        LogicalPlan plan = buildPlan(query);
        PhysicalPlan pp = buildPhysicalPlan(plan);

        DataBag[] bag = new DataBag[pp.getLeaves().size()];

        for (int i = 0; i < bag.length; i++) {
            bag[i] = BagFactory.getInstance().newDefaultBag();
        }

        for (int i = 0; i < pp.getLeaves().size(); i++) {
            System.out.println("Leaves: " + i);
            Tuple t = null;
            for (Result res = pp.getLeaves().get(i).getNext(t); res.returnStatus != POStatus.STATUS_EOP; res = pp
                    .getLeaves().get(i).getNext(t)) {
                if (res.returnStatus == POStatus.STATUS_OK)
                    bag[i].add((Tuple) res.result);
                System.out.println("Split: " + res.result);
            }
        }

        Map<DataByteArray, Integer> seen = new HashMap<DataByteArray, Integer>();
        seen.put(new DataByteArray("1".getBytes()), new Integer(0));
        seen.put(new DataByteArray("2".getBytes()), new Integer(0));
        seen.put(new DataByteArray("3".getBytes()), new Integer(0));
        seen.put(new DataByteArray("4".getBytes()), new Integer(0));
        seen.put(new DataByteArray("5".getBytes()), new Integer(0));

        for (int i = 0; i < bag.length; i++) {
            DataByteArray value = null;
            Iterator<Tuple> it = bag[i].iterator();
            while (it.hasNext()) {
                Tuple t = it.next();
                System.out.println("Value: " + t);
                value = (DataByteArray) t.get(0);
                Integer count = seen.get(value);
                seen.put(value, ++count);
            }
        }

        for (Integer j : seen.values()) {
            assertEquals(j, new Integer(2));
        }
    }

    public PhysicalPlan buildPhysicalPlan(LogicalPlan lp) throws VisitorException {
        LocalLogToPhyTranslationVisitor visitor = new LocalLogToPhyTranslationVisitor(lp);
        visitor.setPigContext(pigContext);
        visitor.visit();
        return visitor.getPhysicalPlan();
    }

    public LogicalPlan buildPlan(String query) {
        return buildPlan(query, LogicalPlanBuilder.class.getClassLoader());
    }

    public LogicalPlan buildPlan(String query, ClassLoader cldr) {
        LogicalPlanBuilder.classloader = cldr;

        LogicalPlanBuilder builder = new LogicalPlanBuilder(pigContext); //

        try {
            String[] qs = query.split(";");
            LogicalPlan lp = null;
            for (String q : qs) {
                q = q.trim();
                if (q.equals(""))
                    continue;
                q += ";";
                System.out.println(q);
                lp = builder.parse("Test-Plan-Builder", q, aliases, logicalOpTable, aliasOp, fileNameMap);
            }

            List<LogicalOperator> roots = lp.getRoots();

            if (roots.size() > 0) {
                for (LogicalOperator op : roots) {
                    if (!(op instanceof LOLoad) && !(op instanceof LODefine)) {
                        throw new Exception("Cannot have a root that is not the load or define operator. Found "
                                + op.getClass().getName());
                    }
                }
            }

            System.err.println("Query: " + query);

            // Just the top level roots and their children
            // Need a recursive one to travel down the tree

            for (LogicalOperator op : lp.getRoots()) {
                System.err.println("Logical Plan Root: " + op.getClass().getName() + " object " + op);

                List<LogicalOperator> listOp = lp.getSuccessors(op);

                if (null != listOp) {
                    Iterator<LogicalOperator> iter = listOp.iterator();
                    while (iter.hasNext()) {
                        LogicalOperator lop = iter.next();
                        System.err.println("Successor: " + lop.getClass().getName() + " object " + lop);
                    }
                }
            }
            lp = refineLogicalPlan(lp);
            assertTrue(lp != null);
            return lp;
        } catch (IOException e) {
            // log.error(e);
            // System.err.println("IOException Stack trace for query: " +
            // query);
            // e.printStackTrace();
            fail("IOException: " + e.getMessage());
        } catch (Exception e) {
            log.error(e);
            // System.err.println("Exception Stack trace for query: " + query);
            // e.printStackTrace();
            fail(e.getClass().getName() + ": " + e.getMessage() + " -- " + query);
        }
        return null;
    }

    private LogicalPlan refineLogicalPlan(LogicalPlan plan) {
        PlanSetter ps = new PlanSetter(plan);
        try {
            ps.visit();

        } catch (VisitorException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        // run through validator
        CompilationMessageCollector collector = new CompilationMessageCollector();
        FrontendException caught = null;
        try {
            boolean isBeforeOptimizer = true;
            LogicalPlanValidationExecutor validator = new LogicalPlanValidationExecutor(plan, pigContext,
                    isBeforeOptimizer);
            validator.validate(plan, collector);

            FunctionalLogicalOptimizer optimizer = new FunctionalLogicalOptimizer(plan);
            optimizer.optimize();

            isBeforeOptimizer = false;
            validator = new LogicalPlanValidationExecutor(plan, pigContext, isBeforeOptimizer);
            validator.validate(plan, collector);
        } catch (FrontendException fe) {
            // Need to go through and see what the collector has in it. But
            // remember what we've caught so we can wrap it into what we
            // throw.
            caught = fe;
        }

        return plan;

    }

    private void init() {
        aliases = new HashMap<LogicalOperator, LogicalPlan>();
        logicalOpTable = new HashMap<OperatorKey, LogicalOperator>();
        aliasOp = new HashMap<String, LogicalOperator>();
        fileNameMap = new HashMap<String, String>();
    }

    Map<LogicalOperator, LogicalPlan> aliases;
    Map<OperatorKey, LogicalOperator> logicalOpTable;
    Map<String, LogicalOperator> aliasOp;
    Map<String, String> fileNameMap;
}