Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.piggybank.test.storage; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.Properties; import org.apache.commons.lang.StringUtils; import org.apache.pig.ExecType; import org.apache.pig.PigServer; import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration; import org.apache.pig.builtin.mock.Storage; import org.apache.pig.builtin.mock.Storage.Data; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.Tuple; import org.apache.pig.test.Util; import org.apache.pig.tools.parameters.ParseException; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; import static org.apache.pig.builtin.mock.Storage.resetData; import static org.apache.pig.builtin.mock.Storage.tuple; public class TestCSVExcelStorage { Properties props = new Properties(); ArrayList<String> testMsgs = new ArrayList<String>(); String testFileCommaName = "testFileComma.csv"; String testFileTabName = "testFileTab.csv"; String testStrComma = "John,Doe,10\n" + "Jane, \"nee, Smith\",20\n" + ",,\n" + "\"Mac \"\"the knife\"\"\",Cohen,30\n" + "\"Conrad\n" + "Emil\",Dinger,40\n" + "1st Field,\"A poem that continues\n" + "for several lines\n" + "do we\n" + "(even with \r)" + "handle that?\",Good,Fairy\n"; String[] testStrCommaArray = new String[] { "John,Doe,10", "Jane, \"nee, Smith\",20", ",,", "\"Mac \"\"the knife\"\"\",Cohen,30", "\"Conrad\nEmil\",Dinger,40", "Emil,\"\nDinger\",40", "Quote problem,\"My \"\"famous\"\"\nsong\",", "1st Field,\"A poem that continues\nfor several lines\ndo we\n(even with \r)handle that?\",Good,Fairy", }; @SuppressWarnings("serial") ArrayList<Tuple> testStrCommaYesMultilineResultTuples = new ArrayList<Tuple>() { { add(Util.createTuple(new String[] { "John", "Doe", "10" })); add(Util.createTuple(new String[] { "Jane", " nee, Smith", "20" })); add(Util.createTuple(new String[] { "", "", "" })); add(Util.createTuple(new String[] { "Mac \"the knife\"", "Cohen", "30" })); add(Util.createTuple(new String[] { "Conrad\nEmil", "Dinger", "40" })); add(Util.createTuple(new String[] { "Emil", "\nDinger", "40" })); add(Util.createTuple(new String[] { "Quote problem", "My \"famous\"\nsong", "" })); add(Util.createTuple(new String[] { "1st Field", "A poem that continues\nfor several lines\ndo we\n(even with \n)handle that?", "Good", "Fairy" })); } }; @SuppressWarnings("serial") ArrayList<Tuple> testStrCommaNoMultilineResultTuples = new ArrayList<Tuple>() { { add(Util.createTuple(new String[] { "John", "Doe", "10" })); add(Util.createTuple(new String[] { "Jane", " nee, Smith", "20" })); add(Util.createTuple(new String[] { "", "", "" })); add(Util.createTuple(new String[] { "Mac \"the knife\"", "Cohen", "30" })); add(Util.createTuple(new String[] { "Conrad" })); add(Util.createTuple(new String[] { "Emil,Dinger,40" })); // Trailing double quote after Emil eats rest of line add(Util.createTuple(new String[] { "Emil" })); add(Util.createTuple(new String[] { "Dinger,40" })); // Trailing double quote after Emil eats rest of line add(Util.createTuple(new String[] { "Quote problem", "My \"famous\"" })); add(Util.createTuple(new String[] { "song," })); add(Util.createTuple(new String[] { "1st Field", "A poem that continues" })); add(Util.createTuple(new String[] { "for several lines" })); add(Util.createTuple(new String[] { "do we" })); add(Util.createTuple(new String[] { "(even with " })); add(Util.createTuple(new String[] { ")handle that?,Good,Fairy" })); // Trailing double quote eats rest of line } }; String testStrTab = "John\tDoe\t50\n" + "\"Foo and CR last\n" + "bar.\"\t\t\n" + "Frank\tClean\t70"; String[] testStrTabArray = new String[] { "John\tDoe\t50", "\"Foo and CR last\nbar.\"\t\t", "Frank\tClean\t70" }; @SuppressWarnings("serial") ArrayList<Tuple> testStrTabYesMultilineResultTuples = new ArrayList<Tuple>() { { add(Util.createTuple(new String[] { "John", "Doe", "50" })); add(Util.createTuple(new String[] { "Foo and CR last\nbar.", "", "" })); add(Util.createTuple(new String[] { "Frank", "Clean", "70" })); } }; private static final String dataDir = "build/test/tmpdata/"; private static final String testFile = "csv_excel_data"; private PigServer pig; @Before public void setup() throws IOException { pig = new PigServer(ExecType.LOCAL); pig.getPigContext().getProperties().setProperty(MRConfiguration.MAP_MAX_ATTEMPTS, "1"); pig.getPigContext().getProperties().setProperty(MRConfiguration.REDUCE_MAX_ATTEMPTS, "1"); pig.getPigContext().getProperties().setProperty(MRConfiguration.JOB_END_NOTIFICATION_RETRY_INTERVAL, "100"); Util.deleteDirectory(new File(dataDir)); pig.mkdirs(dataDir); Util.createLocalInputFile(dataDir + testFile, new String[] { "int_field,long_field,float_field,double_field,chararray_field,bytearray_field", "1,10,2.718,3.14159,qwerty,uiop", "1,10,2.718,3.14159,,", "1,10,,3.15159,,uiop", "1,10,,3.15159,,uiop, moose", "1,,\"2.718\",,\"qwerty\",\"uiop\"", "1,,,,\"", "qwe", "rty\", uiop", "1,10,2.718,3.14159,\"abc\rdef\",uiop", "1,,,,\"qwe,rty\",uiop", "1,,,,\"q\"\"wert\"\"y\", uiop", "1,,,,qwerty,\"u\"\"io\"\"p\"" }); Util.createLocalInputFile(testFileCommaName, testStrCommaArray); Util.createLocalInputFile(testFileTabName, testStrTabArray); } @After public void cleanup() throws IOException { Util.deleteDirectory(new File(dataDir)); pig.shutdown(); } // Load a simple CSV file with no escapes or special options @Test public void testSimpleCsv() throws IOException { String inputFileName = "TestCSVExcelStorage-simple.txt"; Util.createLocalInputFile(inputFileName, new String[] { "foo,bar,baz", "fee,foe,fum" }); String script = "a = load '" + inputFileName + "' using org.apache.pig.piggybank.storage.CSVExcelStorage() " + " as (a:chararray, b:chararray, c:chararray); "; Util.registerMultiLineQuery(pig, script); Iterator<Tuple> it = pig.openIterator("a"); Assert.assertEquals(Util.createTuple(new String[] { "foo", "bar", "baz" }), it.next()); } // Load a field with commas in it (escaped with quotes) @Test public void testQuotedCommas() throws IOException { String inputFileName = "TestCSVExcelStorage-quotedcommas.txt"; Util.createLocalInputFile(inputFileName, new String[] { "\"foo,bar,baz\"", "fee,foe,fum" }); String script = "a = load '" + inputFileName + "' using org.apache.pig.piggybank.storage.CSVExcelStorage() " + " as (a:chararray, b:chararray, c:chararray); "; Util.registerMultiLineQuery(pig, script); Iterator<Tuple> it = pig.openIterator("a"); Assert.assertEquals(Util.createTuple(new String[] { "foo,bar,baz", null, null }), it.next()); Assert.assertEquals(Util.createTuple(new String[] { "fee", "foe", "fum" }), it.next()); } // Two quotes characters should be interpreted as a single literal quotes character @Test public void testQuotedQuotes() throws IOException { String inputFileName = "TestCSVExcelStorage-quotedquotes.txt"; Util.createLocalInputFile(inputFileName, new String[] { "\"foo,\"\"bar\"\",baz\"", "\"\"\"\"\"\"\"\"" }); String script = "a = load '" + inputFileName + "' using org.apache.pig.piggybank.storage.CSVExcelStorage() " + " as (a:chararray); "; Util.registerMultiLineQuery(pig, script); Iterator<Tuple> it = pig.openIterator("a"); Assert.assertEquals(Util.createTuple(new String[] { "foo,\"bar\",baz" }), it.next()); Assert.assertEquals(Util.createTuple(new String[] { "\"\"\"\"" }), it.next()); } // Handle newlines in fields @Test public void testMultiline() throws IOException { // Read the test file: String script = "a = LOAD '" + testFileCommaName + "' " + "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE');"; pig.registerQuery(script); compareExpectedActual(testStrCommaYesMultilineResultTuples, "a"); // Store the test file back down into another file using YES_MULTILINE: String testOutFileName = createOutputFileName(); script = "STORE a INTO '" + testOutFileName + "' USING " + "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX');"; pig.registerQuery(script); // Read it back out using YES_MULTILINE, and see whether it's still correct: script = "b = LOAD '" + testOutFileName + "' " + "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE');"; pig.registerQuery(script); compareExpectedActual(testStrCommaYesMultilineResultTuples, "b"); // Now read it back again, but multilines turned off: script = "c = LOAD '" + testOutFileName + "' " + "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE');"; pig.registerQuery(script); compareExpectedActual(testStrCommaNoMultilineResultTuples, "c"); // Store this re-read test file back down again, into another file using NO_MULTILINE: testOutFileName = createOutputFileName(); script = "STORE c INTO '" + testOutFileName + "' USING " + "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE', 'UNIX');"; pig.registerQuery(script); // Read it back in, again with NO_MULTILINE and see whether it's still correct: script = "d = LOAD '" + testOutFileName + "' " + "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE');"; pig.registerQuery(script); compareExpectedActual(testStrCommaNoMultilineResultTuples, "d"); } // Handle non-comma delimiters @Test public void testTabDelimiter() throws IOException { // Read the test file: String script = "e = LOAD '" + testFileTabName + "' " + "USING org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');"; pig.registerQuery(script); compareExpectedActual(testStrTabYesMultilineResultTuples, "e"); // Store the test file back down into another file using YES_MULTILINE: String testOutFileName = createOutputFileName(); script = "STORE e INTO '" + testOutFileName + "' USING " + "org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');"; pig.registerQuery(script); // Read it back out using YES_MULTILINE, and see whether it's still correct: script = "f = LOAD '" + testOutFileName + "' " + "USING org.apache.pig.piggybank.storage.CSVExcelStorage('\t', 'YES_MULTILINE');"; pig.registerQuery(script); compareExpectedActual(testStrTabYesMultilineResultTuples, "f"); } private void compareExpectedActual(ArrayList<Tuple> theExpected, String theActualPigVarAlias) throws IOException { Iterator<Tuple> actualIt = pig.openIterator(theActualPigVarAlias); Iterator<Tuple> expIt = theExpected.iterator(); while (actualIt.hasNext()) { Tuple actual = actualIt.next(); if (!expIt.hasNext()) Assert.fail("The input contains more records than expected. First unexpected record: " + actual); Tuple expected = expIt.next(); // The following assert does not work, even if // the two tuples are identical in class (BinSedesTuple) // and content. We need to compare element by element: //assertEquals(expected, actual); for (int i = 0; i < expected.size(); i++) { String truthEl = (String) expected.get(i); String actualEl = new String(((DataByteArray) actual.get(i)).get()); Assert.assertEquals(truthEl, actualEl); } } } /* * Hack to get a temp file name to store data into. * The file must not exist when the caller subsequently * tries to write to it. In non-testing code this * would be an intolerable race condition. There's * likely a better way. */ private String createOutputFileName() throws IOException { File f = File.createTempFile("CSVExcelStorageTest", "csv"); f.deleteOnExit(); f.delete(); // On Windows this path will be C:\\..., which // causes errors in the Hadoop environment. Replace // the backslashes with forward slashes: return f.getAbsolutePath().replaceAll("\\\\", "/"); } // Comprehensive loader test: uses several datatypes; skips the header; // handles missing/extra fields; handles quotes, commas, newlines @Test public void load() throws IOException, ParseException { String schema = "i: int, l: long, f: float, d: double, c: chararray, b: bytearray"; pig.registerQuery("data = load '" + dataDir + testFile + "' " + "using org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX', 'SKIP_INPUT_HEADER') " + "AS (" + schema + ");"); Iterator<Tuple> data = pig.openIterator("data"); String[] expected = { // a header in csv_excel_data.csv should be skipped due to 'SKIP_INPUT_HEADER' being set in test_csv_storage_load.pig "(1,10,2.718,3.14159,qwerty,uiop)", // basic data types "(1,10,2.718,3.14159,,)", // missing fields at end "(1,10,,3.15159,,uiop)", // missing field in the middle "(1,10,,3.15159,,uiop)", // extra field (input data has "moose" after "uiop") "(1,,2.718,,qwerty,uiop)", // quoted regular fields (2.718, qwerty, and uiop in quotes) "(1,,,,\nqwe\nrty, uiop)", // newlines in quotes "(1,10,2.718,3.14159,abc\ndef,uiop)", // After LOAD \r => \n (PIG-4213) "(1,,,,qwe,rty,uiop)", // commas in quotes "(1,,,,q\"wert\"y, uiop)", // quotes in quotes "(1,,,,qwerty,u\"io\"p)" // quotes in quotes at the end of a line }; Assert.assertEquals(StringUtils.join(expected, "\n"), StringUtils.join(data, "\n")); } // Comprehensive storer test for non-container fields: // uses several datatypes, writes a header, handle nulls, quotes, commas, newlines @Test public void storeScalarTypes() throws IOException, ParseException { String input = testFile; String schema = "int_field: int, long_field: long, float_field: float, double_field: double, " + "chararray_field: chararray, bytearray_field: bytearray"; String output = "csv_excel_scalar_output"; // Store data pig.registerQuery("data = load '" + dataDir + input + "' " + "using org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX', 'SKIP_INPUT_HEADER') " + "AS (" + schema + ");"); pig.store("data", dataDir + output, "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX', 'WRITE_OUTPUT_HEADER')"); // Read it back pig.registerQuery("data = load '" + dataDir + output + "' " + "using TextLoader() as (line: chararray);"); Iterator<Tuple> data = pig.openIterator("data"); String[] expected = { // header should be written because we used the 'WRITE_OUTPUT_HEADER' argument "(int_field,long_field,float_field,double_field,chararray_field,bytearray_field)", "(1,10,2.718,3.14159,qwerty,uiop)", "(1,10,2.718,3.14159,,)", "(1,10,,3.15159,,uiop)", "(1,10,,3.15159,,uiop)", "(1,,2.718,,qwerty,uiop)", "(1,,,,\")", // since we are just using TextLoader for verification "(qwe)", // it treats the linebreaks as meaning separate records "(rty\", uiop)", // but as shown in the load() test, CSVExcelStorage will read these properly "(1,10,2.718,3.14159,\"abc)", "(def\",uiop)", "(1,,,,\"qwe,rty\",uiop)", "(1,,,,\"q\"\"wert\"\"y\", uiop)", "(1,,,,qwerty,\"u\"\"io\"\"p\")" }; Assert.assertEquals(StringUtils.join(expected, "\n"), StringUtils.join(data, "\n")); } // Test that tuples/bags/maps are stored as strings @Test public void storeComplexTypes() throws IOException, ParseException { String input = "csv_excel_complex_input"; String schema = "a:(b:int,c:int),d:(e:int,f:(g:int,h:int)),i:{j:(k:int,l:int)},m:{n:(o:int,p:{q:(r:int,s:int)})},t:[int],u:[[int]]"; String output = "csv_excel_complex_output"; Util.createLocalInputFile(dataDir + input, new String[] { "(1,2)|(1,(2,3))|{(1,2),(3,4)}|{(1,{(2,3),(4,5)}),(6,{(7,8),(9,0)})}|[a#1,b#2]|[a#[b#1,c#2],d#[e#3,f#4]]", "(1,)|(1,(2,))|{(1,),(3,)}|{(1,{(,3),(,5)}),(6,{(7,),(9,)})}|[a#,b#2]|[a#[b#,c#2],d#]" }); pig.registerQuery( "data = load '" + dataDir + input + "' " + "using PigStorage('|')" + "AS (" + schema + ");"); pig.store("data", dataDir + output, "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX', 'SKIP_OUTPUT_HEADER')"); pig.registerQuery("data = load '" + dataDir + output + "' " + "using TextLoader() as (line: chararray);"); Iterator<Tuple> data = pig.openIterator("data"); String[] expected = { "(\"(1,2)\",\"(1,(2,3))\",\"{(1,2),(3,4)}\",\"{(1,{(2,3),(4,5)}),(6,{(7,8),(9,0)})}\",\"{b=2, a=1}\",\"{d={f=4, e=3}, a={b=1, c=2}}\")", "(\"(1,)\",\"(1,(2,))\",\"{(1,),(3,)}\",\"{(1,{(,3),(,5)}),(6,{(7,),(9,)})}\",\"{b=2, a=null}\",\"{d=null, a={b=null, c=2}}\")" }; String[] expectedJDK8 = { "(\"(1,2)\",\"(1,(2,3))\",\"{(1,2),(3,4)}\",\"{(1,{(2,3),(4,5)}),(6,{(7,8),(9,0)})}\",\"{a=1, b=2}\",\"{a={b=1, c=2}, d={e=3, f=4}}\")", "(\"(1,)\",\"(1,(2,))\",\"{(1,),(3,)}\",\"{(1,{(,3),(,5)}),(6,{(7,),(9,)})}\",\"{a=null, b=2}\",\"{a={b=null, c=2}, d=null}\")" }; String actual = StringUtils.join(data, "\n"); Assert.assertTrue("Failed to match. Output was " + actual, StringUtils.join(expected, "\n").equals(actual) || StringUtils.join(expectedJDK8, "\n").equals(actual)); } // Test that STORE stores CR (\r) quoted/unquoted in yes_multiline/no_multiline @Test public void storeCR() throws IOException { ArrayList<Tuple> inputTuples = new ArrayList<Tuple>(); inputTuples.add(Storage.tuple(1, "text", "a line\rand another line to write")); String expected = "1,text,\"a line\rand another line to write\"\n"; String expectedNoMultiline = "1,text,a line\rand another line to write\n"; // Prepare the input using mock.Storage() since this will not interpret \r Data data = Storage.resetData(pig); data.set("inputTuples", inputTuples); // Test for quoted when YES_MULTILINE // Execute String testOut = dataDir + "csv_cr_quoted_output_yes_multiline"; String script = "A = load 'inputTuples' USING mock.Storage() as (f1:int, f2:chararray, f3:chararray);" + "STORE A INTO '" + testOut + "' USING " + "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX');"; Util.registerMultiLineQuery(pig, script); // Load result FileInputStream resultFile = new FileInputStream(testOut + "/part-m-00000"); byte[] actualBytes = new byte[resultFile.available()]; resultFile.read(actualBytes); resultFile.close(); String actual = new String(actualBytes); Assert.assertEquals(expected, actual); // Test for unquoted when NO_MULTILINE // Execute testOut = dataDir + "csv_cr_quoted_output_no_multiline"; script = "A = load 'inputTuples' USING mock.Storage() as (f1:int, f2:chararray, f3:chararray);" + "STORE A INTO '" + testOut + "' USING " + "org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'NO_MULTILINE', 'UNIX');"; Util.registerMultiLineQuery(pig, script); // Load result resultFile = new FileInputStream(testOut + "/part-m-00000"); actualBytes = new byte[resultFile.available()]; resultFile.read(actualBytes); resultFile.close(); actual = new String(actualBytes); Assert.assertEquals(expectedNoMultiline, actual); } // Test to validate that each CSV file gets the correct header if they are run at the same time (PIG-4689) @Test public void storeTwoFilesWithDifferentHeaders() throws IOException, ParseException { pig.setBatchOn(); // Very important to reproduce this bug Storage.Data data = resetData(pig); String fooOutFileName = createOutputFileName(); data.set("foo", "foo_1:chararray", tuple("A")); pig.registerQuery("foo = LOAD 'foo' USING mock.Storage();"); pig.registerQuery("STORE foo INTO '" + fooOutFileName + "' " + "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX', 'WRITE_OUTPUT_HEADER');"); String barOutFileName = createOutputFileName(); data.set("bar", "bar_1:chararray, bar_2:chararray", tuple("B", "C")); pig.registerQuery("bar = LOAD 'bar' USING mock.Storage();"); pig.registerQuery("STORE bar INTO '" + barOutFileName + "' " + "USING org.apache.pig.piggybank.storage.CSVExcelStorage(',', 'YES_MULTILINE', 'UNIX', 'WRITE_OUTPUT_HEADER');"); pig.executeBatch(); // ----- pig.registerQuery("fooCsv = load '" + fooOutFileName + "' ;"); Iterator<Tuple> fooCsv = pig.openIterator("fooCsv"); String[] expectedFooCsv = { // header should be written because we used the 'WRITE_OUTPUT_HEADER' argument "(foo_1)", "(A)" }; Assert.assertEquals(StringUtils.join(expectedFooCsv, "\n"), StringUtils.join(fooCsv, "\n")); // ----- pig.registerQuery("barCsv = load '" + barOutFileName + "' ;"); Iterator<Tuple> barCsv = pig.openIterator("barCsv"); String[] expectedbarCsv = { // header should be written because we used the 'WRITE_OUTPUT_HEADER' argument "(bar_1,bar_2)", "(B,C)" }; Assert.assertEquals(StringUtils.join(expectedbarCsv, "\n"), StringUtils.join(barCsv, "\n")); } }