org.apache.metron.dataloads.nonbulk.flatfile.SimpleFlatFileSummarizerTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.metron.dataloads.nonbulk.flatfile.SimpleFlatFileSummarizerTest.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.metron.dataloads.nonbulk.flatfile;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.adrianwalker.multilinestring.Multiline;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.PosixParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.metron.dataloads.extractor.ExtractorHandler;
import org.apache.metron.dataloads.nonbulk.flatfile.importer.LocalSummarizer;
import org.apache.metron.dataloads.nonbulk.flatfile.location.Location;
import org.apache.metron.dataloads.nonbulk.flatfile.location.RawLocation;
import org.apache.metron.dataloads.nonbulk.flatfile.writer.InvalidWriterOutput;
import org.apache.metron.dataloads.nonbulk.flatfile.writer.Writer;
import org.apache.metron.stellar.common.utils.StellarProcessorUtils;
import org.junit.Assert;
import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.concurrent.atomic.AtomicReference;

public class SimpleFlatFileSummarizerTest {
    /**
     {
     "config" : {
       "columns" : {
         "rank" : 0,
         "domain" : 1
       },
       "value_transform" : {
         "domain" : "DOMAIN_REMOVE_TLD(domain)"
       },
       "value_filter" : "LENGTH(domain) > 0",
       "state_init" : "MULTISET_INIT()",
       "state_update" : {
         "state" : "MULTISET_ADD(state, domain)"
       },
       "state_merge" : "MULTISET_MERGE(states)",
       "separator" : ","
       },
       "extractor" : "CSV"
     }
     */
    @Multiline
    public static String stellarExtractorConfigLineByLine;

    /**
     {
     "config" : {
       "columns" : {
         "rank" : 0,
         "domain" : 1
       },
       "value_transform" : {
         "domain" : "DOMAIN_REMOVE_TLD(domain)"
       },
       "value_filter" : "LENGTH(domain) > 0",
       "state_init" : "MULTISET_INIT()",
       "state_update" : {
         "state" : "MULTISET_ADD(state, domain)"
       },
       "state_merge" : "MULTISET_MERGE(states)",
       "separator" : ","
       },
       "extractor" : "CSV",
       "inputFormat" : "WHOLE_FILE"
     }
     */
    @Multiline
    public static String stellarExtractorConfigWholeFile;

    public static List<String> domains = ImmutableList.of("google.com", "youtube.com", "facebook.com", "baidu.com",
            "wikipedia.org", "yahoo.com", "google.co.in", "reddit.com", "qq.com", "amazon.com", "taobao.com",
            "tmall.com", "twitter.com", "live.com", "vk.com", "google.co.jp", "instagram.com", "sohu.com",
            "sina.com.cn", "jd.com");

    public static String generateData() {
        List<String> tmp = new ArrayList<>();
        int i = 1;
        for (String d : domains) {
            tmp.add(i + "," + d);
        }
        return Joiner.on("\n").join(tmp);
    }

    @Test
    public void testArgs() throws Exception {
        String[] argv = { "-e extractor.json", "-o out.ser", "-l log4j", "-i input.csv", "-p 2", "-b 128", "-q" };

        Configuration config = new Configuration();
        String[] otherArgs = new GenericOptionsParser(config, argv).getRemainingArgs();

        CommandLine cli = SummarizeOptions.parse(new PosixParser(), otherArgs);
        Assert.assertEquals("extractor.json", SummarizeOptions.EXTRACTOR_CONFIG.get(cli).trim());
        Assert.assertEquals("input.csv", SummarizeOptions.INPUT.get(cli).trim());
        Assert.assertEquals("log4j", SummarizeOptions.LOG4J_PROPERTIES.get(cli).trim());
        Assert.assertEquals("2", SummarizeOptions.NUM_THREADS.get(cli).trim());
        Assert.assertEquals("128", SummarizeOptions.BATCH_SIZE.get(cli).trim());
    }

    public static class InMemoryLocation implements RawLocation {
        Map<String, String> inMemoryData;

        public InMemoryLocation(Map<String, String> inMemoryData) {
            this.inMemoryData = inMemoryData;
        }

        @Override
        public Optional<List<String>> list(String loc) throws IOException {
            if (loc.equals(".")) {
                ArrayList<String> ret = new ArrayList<>(inMemoryData.keySet());
                return Optional.of(ret);
            }
            return Optional.empty();
        }

        @Override
        public boolean exists(String loc) {
            return loc.equals(".") ? true : inMemoryData.containsKey(loc);
        }

        @Override
        public boolean isDirectory(String loc) throws IOException {
            return loc.equals(".") ? true : false;
        }

        @Override
        public InputStream openInputStream(String loc) throws IOException {
            return new ByteArrayInputStream(inMemoryData.get(loc).getBytes());
        }

        @Override
        public boolean match(String loc) {
            return exists(loc);
        }
    }

    public class MockSummarizer extends LocalSummarizer {
        Map<String, String> mockData;

        public MockSummarizer(Map<String, String> mockData) {
            this.mockData = mockData;
        }

        @Override
        protected List<Location> getLocationsRecursive(List<String> inputs, FileSystem fs) throws IOException {
            Set<Location> ret = new HashSet<>();
            for (String input : inputs) {
                if (input.equals(".")) {
                    for (String s : mockData.keySet()) {
                        ret.add(resolveLocation(s, fs));
                    }
                } else {
                    ret.add(resolveLocation(input, fs));
                }
            }
            return new ArrayList<>(ret);
        }

        @Override
        protected Location resolveLocation(String input, FileSystem fs) {
            return new Location(input, new InMemoryLocation(mockData));
        }
    }

    public static class PeekingWriter implements Writer {
        AtomicReference<Object> ref;

        public PeekingWriter(AtomicReference<Object> ref) {
            this.ref = ref;
        }

        @Override
        public void validate(Optional<String> output, Configuration hadoopConfig) {

        }

        @Override
        public void write(Object obj, Optional<String> output, Configuration hadoopConfig) throws IOException {
            ref.set(obj);
        }

        @Override
        public void write(byte[] obj, Optional<String> output, Configuration hadoopConfig) throws IOException {

        }
    }

    @Test
    public void testLineByLine() throws IOException, InvalidWriterOutput {
        testLineByLine(5);
        testLineByLine(1);
    }

    public void testLineByLine(final int numThreads) throws IOException, InvalidWriterOutput {
        ExtractorHandler handler = ExtractorHandler.load(stellarExtractorConfigLineByLine);
        LocalSummarizer summarizer = new MockSummarizer(ImmutableMap.of("input.csv", generateData()));
        final AtomicReference<Object> finalObj = new AtomicReference<>(null);
        EnumMap<SummarizeOptions, Optional<Object>> options = new EnumMap<SummarizeOptions, Optional<Object>>(
                SummarizeOptions.class) {
            {
                put(SummarizeOptions.INPUT, Optional.of("input.csv"));
                put(SummarizeOptions.BATCH_SIZE, Optional.of(5));
                put(SummarizeOptions.QUIET, Optional.of(true));
                put(SummarizeOptions.OUTPUT_MODE, Optional.of(new PeekingWriter(finalObj)));
                put(SummarizeOptions.OUTPUT, Optional.of("out"));
                put(SummarizeOptions.NUM_THREADS, Optional.of(numThreads));
            }
        };
        summarizer.importData(options, handler, new Configuration());
        String expr = "MAP_GET(DOMAIN_REMOVE_TLD(domain), s) > 0";
        for (String domain : domains) {
            Boolean b = (Boolean) StellarProcessorUtils.run(expr,
                    ImmutableMap.of("s", finalObj.get(), "domain", domain));
            Assert.assertTrue("Can't find " + domain, b);
        }
    }

    @Test
    public void testWholeFile() throws Exception {
        testWholeFile(5);
        testWholeFile(1);
    }

    public void testWholeFile(final int numThreads) throws IOException, InvalidWriterOutput {
        ExtractorHandler handler = ExtractorHandler.load(stellarExtractorConfigWholeFile);
        LocalSummarizer summarizer = new MockSummarizer(new HashMap<String, String>() {
            {
                for (String domain : domains) {
                    put(domain, "1," + domain);
                }
            }
        });
        final AtomicReference<Object> finalObj = new AtomicReference<>(null);
        EnumMap<SummarizeOptions, Optional<Object>> options = new EnumMap<SummarizeOptions, Optional<Object>>(
                SummarizeOptions.class) {
            {
                put(SummarizeOptions.INPUT, Optional.of("."));
                put(SummarizeOptions.BATCH_SIZE, Optional.of(5));
                put(SummarizeOptions.QUIET, Optional.of(true));
                put(SummarizeOptions.OUTPUT_MODE, Optional.of(new PeekingWriter(finalObj)));
                put(SummarizeOptions.OUTPUT, Optional.of("out"));
                put(SummarizeOptions.NUM_THREADS, Optional.of(numThreads));
            }
        };
        summarizer.importData(options, handler, new Configuration());
        String expr = "MAP_GET(DOMAIN_REMOVE_TLD(domain), s) > 0";
        for (String domain : domains) {
            Boolean b = (Boolean) StellarProcessorUtils.run(expr,
                    ImmutableMap.of("s", finalObj.get(), "domain", domain));
            Assert.assertTrue("Can't find " + domain, b);
        }
    }

}