org.apache.pig.piggybank.test.storage.TestXMLLoader.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pig.piggybank.test.storage.TestXMLLoader.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
 * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */

package org.apache.pig.piggybank.test.storage;

import static org.apache.pig.ExecType.LOCAL;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Random;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import junit.framework.TestCase;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration;
import org.apache.pig.data.Tuple;
import org.apache.pig.test.Util;
import org.w3c.dom.Document;

public class TestXMLLoader extends TestCase {
    public static ArrayList<String[]> data = new ArrayList<String[]>();
    static {
        data.add(new String[] { "<configuration>" });
        data.add(new String[] { "<property>" });
        data.add(new String[] { "<name> foobar </name>" });
        data.add(new String[] { "<value> barfoo </value>" });
        data.add(new String[] { "</property>" });
        data.add(new String[] { "<ignoreProperty>" });
        data.add(new String[] { "<name> foo </name>" });
        data.add(new String[] { "</ignoreProperty>" });
        data.add(new String[] { "<property>" });
        data.add(new String[] { "<name> justname </name>" });
        data.add(new String[] { "</property>" });
        data.add(new String[] { "</configuration>" });
    }

    public static ArrayList<String[]> nestedTags = new ArrayList<String[]>();
    static {
        nestedTags.add(new String[] { "<events>" });
        nestedTags.add(new String[] { "<event id='116913365'>" });
        nestedTags.add(new String[] { "<eventRank>1.000000000000</eventRank>" });
        nestedTags.add(new String[] { "<name>XY</name>" });
        nestedTags.add(new String[] { "<relatedEvents>" });
        nestedTags.add(new String[] { "<event id='116913365'>x</event>" });
        nestedTags.add(new String[] { "<event id='116913365'>y</event>" });
        nestedTags.add(new String[] { "</relatedEvents>" });
        nestedTags.add(new String[] { "</event>" });

        nestedTags.add(new String[] { "<event id='116913365'>" });
        nestedTags.add(new String[] { "<eventRank>3.0000</eventRank>" });
        nestedTags.add(new String[] { "<name>AB</name>" });
        nestedTags.add(new String[] { "<relatedEvents>" });
        nestedTags.add(new String[] { "<event id='116913365'>a</event>" });
        nestedTags.add(new String[] { "<event id='116913365'>b</event>" });
        nestedTags.add(new String[] { "</relatedEvents>" });
        nestedTags.add(new String[] { "</event>" });

        nestedTags.add(new String[] { "<event>" });
        nestedTags.add(new String[] { "<eventRank>4.0000</eventRank>" });
        nestedTags.add(new String[] { "<name>CD</name>" });
        nestedTags.add(new String[] { "<relatedEvents>" });
        nestedTags.add(new String[] { "<event>c</event>" });
        nestedTags.add(new String[] { "<event>d</event>" });
        nestedTags.add(new String[] { "</relatedEvents>" });
        nestedTags.add(new String[] { "</event>" });
        nestedTags.add(new String[] { "</events>" });
    }

    public static ArrayList<String[]> inlineClosedTags = new ArrayList<String[]>();
    static {
        inlineClosedTags.add(new String[] { "<events>" });
        inlineClosedTags.add(new String[] { "<event id='3423'/>" });
        inlineClosedTags.add(new String[] { "<event/>" });
        inlineClosedTags.add(new String[] { "<event><event/></event>" });
        inlineClosedTags.add(new String[] { "<event id='33'><tag k='a' v='b'/></event>" });
        inlineClosedTags.add(new String[] { "</events>" });
    }

    public static ArrayList<String[]> indentedXmlWithMultilineLineContent = new ArrayList<String[]>();
    static {
        indentedXmlWithMultilineLineContent.add(new String[] { "    <page>You have " });
        indentedXmlWithMultilineLineContent.add(new String[] { "not missed it</page>" });
    }

    public void testShouldReturn0TupleCountIfSearchTagIsNotFound() throws Exception {
        String filename = TestHelper.createTempFile(data, "");
        PigServer pig = new PigServer(LOCAL);
        filename = filename.replace("\\", "\\\\");
        String query = "A = LOAD '" + filename
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('invalid') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        int tupleCount = 0;
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    tupleCount++;
                }
            }
        }
        assertEquals(0, tupleCount);
    }

    public void testLoadXMLLoader() throws Exception {
        //ArrayList<DataByteArray[]> expected = TestHelper.getExpected(data, pattern);
        String filename = TestHelper.createTempFile(data, "");
        PigServer pig = new PigServer(LOCAL);
        filename = filename.replace("\\", "\\\\");
        String query = "A = LOAD '" + filename
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('property') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        int tupleCount = 0;
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    tupleCount++;
                }
            }
        }
        assertEquals(2, tupleCount);
    }

    public void testXMLLoaderShouldLoadBasicBzip2Files() throws Exception {
        String filename = TestHelper.createTempFile(data, "");
        Process bzipProc = Runtime.getRuntime().exec("bzip2 " + filename);
        int waitFor = bzipProc.waitFor();

        if (waitFor != 0) {
            fail("Failed to create the class");
        }

        filename = filename + ".bz2";

        try {
            PigServer pigServer = new PigServer(ExecType.LOCAL);
            String loadQuery = "A = LOAD '" + Util.encodeEscape(filename)
                    + "' USING org.apache.pig.piggybank.storage.XMLLoader('property') as (doc:chararray);";
            pigServer.registerQuery(loadQuery);

            Iterator<Tuple> it = pigServer.openIterator("A");
            int tupleCount = 0;
            while (it.hasNext()) {
                Tuple tuple = (Tuple) it.next();
                if (tuple == null)
                    break;
                else {
                    //TestHelper.examineTuple(expected, tuple, tupleCount);
                    if (tuple.size() > 0) {
                        tupleCount++;
                    }
                }
            }
            assertEquals(2, tupleCount);

        } finally {
            new File(filename).delete();
        }
    }

    public void testLoaderShouldLoadBasicGzFile() throws Exception {
        String filename = TestHelper.createTempFile(data, "");

        Process bzipProc = Runtime.getRuntime().exec("gzip " + filename);
        int waitFor = bzipProc.waitFor();

        if (waitFor != 0) {
            fail("Failed to create the class");
        }

        filename = filename + ".gz";

        try {
            PigServer pigServer = new PigServer(ExecType.LOCAL);
            String loadQuery = "A = LOAD '" + Util.encodeEscape(filename)
                    + "' USING org.apache.pig.piggybank.storage.XMLLoader('property') as (doc:chararray);";
            pigServer.registerQuery(loadQuery);

            Iterator<Tuple> it = pigServer.openIterator("A");
            int tupleCount = 0;
            while (it.hasNext()) {
                Tuple tuple = (Tuple) it.next();
                if (tuple == null)
                    break;
                else {
                    if (tuple.size() > 0) {
                        tupleCount++;
                    }
                }
            }
            assertEquals(2, tupleCount);

        } finally {
            new File(filename).delete();
        }
    }

    public void testXMLLoaderShouldNotConfusedWithTagsHavingSimilarPrefix() throws Exception {
        ArrayList<String[]> testData = new ArrayList<String[]>();
        testData.add(new String[] { "<namethisalso> foobar9 </namethisalso>" });
        testData.addAll(data);
        String filename = TestHelper.createTempFile(testData, "");
        PigServer pig = new PigServer(LOCAL);
        filename = filename.replace("\\", "\\\\");
        String query = "A = LOAD '" + filename
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('name') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        int tupleCount = 0;
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    tupleCount++;
                }
            }
        }
        assertEquals(3, tupleCount);
    }

    public void testShouldReturn1ForIntermediateTagData() throws Exception {
        String filename = TestHelper.createTempFile(data, "");
        PigServer pig = new PigServer(LOCAL);
        filename = filename.replace("\\", "\\\\");
        String query = "A = LOAD '" + filename
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('ignoreProperty') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        int tupleCount = 0;
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    tupleCount++;
                }
            }
        }
        assertEquals(1, tupleCount);
    }

    public void testShouldReturn0TupleCountIfNoEndTagIsFound() throws Exception {
        // modify the data content to avoid end tag for </ignoreProperty>
        ArrayList<String[]> testData = new ArrayList<String[]>();
        for (String content[] : data) {
            if (!content[0].equals("</ignoreProperty>")) {
                testData.add(content);
            }
        }

        String filename = TestHelper.createTempFile(testData, "");
        PigServer pig = new PigServer(LOCAL);
        filename = filename.replace("\\", "\\\\");
        String query = "A = LOAD '" + filename
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('ignoreProperty') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        int tupleCount = 0;
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    tupleCount++;
                }
            }
        }
        assertEquals(0, tupleCount);
    }

    public void testShouldReturn0TupleCountIfEmptyFileIsPassed() throws Exception {
        // modify the data content to avoid end tag for </ignoreProperty>
        ArrayList<String[]> testData = new ArrayList<String[]>();

        String filename = TestHelper.createTempFile(testData, "");
        PigServer pig = new PigServer(LOCAL);
        filename = filename.replace("\\", "\\\\");
        String query = "A = LOAD '" + filename
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('ignoreProperty') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        int tupleCount = 0;
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    tupleCount++;
                }
            }
        }
        assertEquals(0, tupleCount);
    }

    public void testXMLLoaderShouldSupportNestedTagWithSameName() throws Exception {
        String filename = TestHelper.createTempFile(nestedTags, "");
        PigServer pig = new PigServer(LOCAL);
        filename = filename.replace("\\", "\\\\");
        String query = "A = LOAD '" + filename
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        int tupleCount = 0;
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    tupleCount++;
                }
            }
        }
        assertEquals(3, tupleCount);
    }

    public void testXMLLoaderShouldWorkWithInlineClosedTags() throws Exception {
        String filename = TestHelper.createTempFile(inlineClosedTags, "");
        PigServer pig = new PigServer(LOCAL);
        filename = filename.replace("\\", "\\\\");
        String query = "A = LOAD '" + filename
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        int tupleCount = 0;
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    tupleCount++;
                }
            }
        }
        assertEquals(4, tupleCount);
    }

    public void testXMLLoaderShouldWorkWithIndentedXmlWithMultilineContent() throws Exception {
        String filename = TestHelper.createTempFile(indentedXmlWithMultilineLineContent, "");
        PigServer pig = new PigServer(LOCAL);
        filename = filename.replace("\\", "\\\\");
        String query = "A = LOAD '" + filename
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('page') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        int tupleCount = 0;
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                System.out.println(((String) tuple.get(0)));
                assertTrue(((String) tuple.get(0)).equals("<page>You have not missed it</page>"));
                tupleCount++;
            }
        }
        assertEquals(1, tupleCount);
    }

    public void testXMLLoaderShouldReturnValidXML() throws Exception {
        String filename = TestHelper.createTempFile(inlineClosedTags, "");
        PigServer pig = new PigServer(LOCAL);
        filename = filename.replace("\\", "\\\\");
        String query = "A = LOAD '" + filename
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                // Test it returns a valid XML
                DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
                docBuilder.parse(new ByteArrayInputStream(((String) tuple.get(0)).getBytes()));
            }
        }
    }

    /**
     * This test case test the special case when a non-matching tag spans two file
     * splits in a .bz2 compressed file. At the same time, the part that falls in
     * the first split is a prefix of the matching tag.
     * In other words, till the end of the first split, it looks like the tag is
     * matching but it is not actually matching.
     *
     * @throws Exception
     */
    public void testXMLLoaderShouldNotReturnLastNonMatchedTag() throws Exception {
        Configuration conf = new Configuration();
        long blockSize = 100 * 1024;
        conf.setLong("fs.local.block.size", blockSize);

        String tagName = "event";

        PigServer pig = new PigServer(LOCAL, conf);
        FileSystem localFs = FileSystem.getLocal(conf);
        FileStatus[] testFiles = localFs
                .globStatus(new Path("src/test/java/org/apache/pig/piggybank/test/evaluation/xml/data/*xml.bz2"));
        assertTrue("No test files", testFiles.length > 0);
        for (FileStatus testFile : testFiles) {
            String testFileName = testFile.getPath().toUri().getPath().replace("\\", "\\\\");
            String query = "A = LOAD '" + testFileName
                    + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
            pig.registerQuery(query);
            Iterator<?> it = pig.openIterator("A");
            while (it.hasNext()) {
                Tuple tuple = (Tuple) it.next();
                if (tuple == null)
                    break;
                else {
                    if (tuple.size() > 0) {
                        assertTrue(((String) tuple.get(0)).startsWith("<" + tagName + ">"));
                    }
                }
            }
        }
    }

    /**
     * This test checks that a multi-line tag spanning two splits should be
     * matched.
     * @throws Exception
     */
    public void testXMLLoaderShouldMatchTagSpanningSplits() throws Exception {
        Configuration conf = new Configuration();
        long blockSize = 512;
        conf.setLong("fs.local.block.size", blockSize);
        conf.setLong(MRConfiguration.MAX_SPLIT_SIZE, blockSize);

        String tagName = "event";
        File tempFile = File.createTempFile("long-file", ".xml");
        FileSystem localFs = FileSystem.getLocal(conf);
        FSDataOutputStream directOut = localFs.create(new Path(tempFile.getAbsolutePath()), true);

        String matchingElement = "<event>\ndata\n</event>\n";
        long pos = 0;
        int matchingCount = 0;
        PrintStream ps = new PrintStream(directOut);
        // 1- Write some elements that fit completely in the first block
        while (pos + 2 * matchingElement.length() < blockSize) {
            ps.print(matchingElement);
            pos += matchingElement.length();
            matchingCount++;
        }
        // 2- Write a long element that spans multiple lines and multiple blocks
        String longElement = matchingElement.replace("data",
                "data\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\n");
        ps.print(longElement);
        pos += longElement.length();
        matchingCount++;
        // 3- Write some more elements to fill in the second block completely
        while (pos < 2 * blockSize) {
            ps.print(matchingElement);
            pos += matchingElement.length();
            matchingCount++;
        }
        ps.close();

        PigServer pig = new PigServer(LOCAL, conf);
        String tempFileName = tempFile.getAbsolutePath().replace("\\", "\\\\");
        String query = "A = LOAD '" + tempFileName
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");

        int count = 0;
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    count++;
                    // Make sure the returned text is a proper XML element
                    DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
                    Document doc = docBuilder.parse(new ByteArrayInputStream(((String) tuple.get(0)).getBytes()));
                    assertTrue(doc.getDocumentElement().getNodeName().equals(tagName));
                }
            }
        }
        assertEquals(matchingCount, count);
    }
}