Java tutorial: extracting and normalizing text features from TAB-separated logs with Apache Spark

The listing below reads TAB-separated ad-log records with Spark, keeps a handful of text columns (query, title, description, URLs), and normalizes them with an ordered list of regex replacements.
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.yahoo.spaclu.data.extract;

import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.AbstractMap.SimpleEntry;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.yahoo.spaclu.model.Settings;

/**
 * Formats raw textual data into database format.
 */
public class ExtractFeatureSpark {
    final static Logger sLogger = LoggerFactory.getLogger(ExtractFeatureSpark.class);

    /**
     * Regex replacements applied to every extracted field, in order.
     */
    public static List<Entry<String, String>> patternString = new LinkedList<Entry<String, String>>();

    static {
        // strip <strong> tags but keep their contents
        patternString.add(new SimpleEntry<String, String>("(?i)(<strong.*?>)(.+?)(</strong>)", "$2"));
        // remove all the website patterns
        patternString.add(new SimpleEntry<String, String>(
                "(?i)(www\\.)(.+?)(\\.(com|net|gov|org|edu|co|biz|tv|info))", "$2"));
        patternString.add(new SimpleEntry<String, String>(
                "(?i)(.+?)(\\.(com|net|gov|org|edu|co|biz|tv|info))", "$1"));
        // remove all punctuation
        patternString.add(new SimpleEntry<String, String>("\\p{Punct}", " "));
        // remove all digits
        patternString.add(new SimpleEntry<String, String>("\\d+", " "));
        // collapse duplicate spaces
        patternString.add(new SimpleEntry<String, String>(" +", " "));
    }
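
    /*
     * Illustrative helper (hypothetical, not part of the original file): it
     * applies the pattern list in order, exactly as LineFilter.call below
     * does inline, and shows the intended effect on one sample string.
     */
    static String applyPatterns(String text) {
        // LineFilter lower-cases each field before applying the patterns
        String result = text.toLowerCase();
        for (Entry<String, String> pattern : patternString) {
            result = result.replaceAll(pattern.getKey(), pattern.getValue());
        }
        return result.trim();
        // e.g. applyPatterns("Visit <strong>WWW.Example.COM</strong> today!!!")
        // yields "visit example today": the <strong> tags are stripped, the
        // domain collapses to "example", and punctuation becomes spaces.
    }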

    /**
     * Column layout of the TAB-separated input records; the ordinal of each
     * constant is the index of the corresponding field, and SIZE is the
     * expected column count.
     */
    public enum CBBingCombinedFields {
        CBBingCombined_pvid,
        CBBingCombined_sec,
        CBBingCombined_adRank,
        CBBingCombined_page_position,
        CBBingCombined_dude,
        CBBingCombined_n_ads,
        CBBingCombined_pagenum,
        CBBingCombined_rawQuery,
        CBBingCombined_canonicalQuery,
        CBBingCombined_exactCanonicalQuery,
        CBBingCombined_osqp,
        CBBingCombined_bucket,
        CBBingCombined_timestamp,
        CBBingCombined_matchType,
        CBBingCombined_title,
        CBBingCombined_description,
        CBBingCombined_displayUrl,
        CBBingCombined_targetUrl,
        CBBingCombined_sitelinkNum,
        CBBingCombined_sitelinkClicks,
        CBBingCombined_adClicks,
        CBBingCombined_adSiteLinkClicks,
        CBBingCombined_clkbMS,
        CBBingCombined_ec,
        CBBingCombined_cpcMl,
        CBBingCombined_cpcSb,
        CBBingCombined_bid,
        CBBingCombined_rankScore,
        CBBingCombined_relevanceScore,
        CBBingCombined_fddEcpm,
        CBBingCombined_quality_score,
        CBBingCombined_dwell_time,
        CBBingCombined_bcookie,
        CBBingCombined_ip,
        CBBingCombined_birthYear,
        CBBingCombined_gender,
        CBBingCombined_browser,
        CBBingCombined_userAgent,
        CBBingCombined_referUrl,
        CBBingCombined_ucp,
        CBBingCombined_scb,
        CBBingCombined_effectiveFddThresholds,
        CBBingCombined_iguid,
        CBBingCombined_model,
        CBBingCombined_device_type,
        CBBingCombined_screen_width,
        CBBingCombined_screen_height,
        CBBingCombined_carrier,
        CBBingCombined_srcspc,
        CBBingCombined_advertiser_id,
        CBBingCombined_campaign_id,
        CBBingCombined_adgroup_id,
        CBBingCombined_ad_id,
        CBBingCombined_bidded_term_id,
        CBBingCombined_apollo_advertiser_id,
        CBBingCombined_apollo_campaign_id,
        CBBingCombined_apollo_adgroup_id,
        CBBingCombined_apollo_ad_id,
        CBBingCombined_apollo_bidded_term_id,
        CBBingCombined_NormalizedKeyword,
        CBBingCombined_ClickabilityScore,
        CBBingCombined_deviceCategory,
        CBBingCombined_source,
        SIZE
    }

    /**
     * Keeps only the requested columns of a TAB-separated record and
     * normalizes them with the pattern list.
     */
    final static class LineFilter implements Function<String, String> {
        protected final List<Integer> featureIndices;

        LineFilter(List<Integer> featureIndices) {
            this.featureIndices = featureIndices;
        }

        @Override
        public String call(String line) {
            // limit -1 keeps trailing empty fields, so the column-count
            // check below also holds for records whose last fields are empty
            String[] tokens = line.split(Settings.TAB, -1);
            Preconditions.checkArgument(tokens.length == CBBingCombinedFields.SIZE.ordinal(),
                    tokens.length + "\t" + CBBingCombinedFields.SIZE.ordinal());

            StringBuilder stringBuilder = new StringBuilder();
            Iterator<Integer> iter = featureIndices.iterator();
            while (iter.hasNext()) {
                String temp = tokens[iter.next()].toLowerCase();
                temp = StringEscapeUtils.unescapeXml(temp);

                Iterator<Entry<String, String>> patternIter = patternString.iterator();
                while (patternIter.hasNext()) {
                    Entry<String, String> pattern = patternIter.next();
                    temp = temp.replaceAll(pattern.getKey(), pattern.getValue());
                }

                stringBuilder.append(temp);
                stringBuilder.append(Settings.TAB);
            }

            return stringBuilder.toString().trim();
        }
    }
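
    /*
     * Illustrative usage (hypothetical, not part of the original file):
     * selecting and cleaning a single column of one record. Arrays.asList
     * would additionally need a java.util.Arrays import.
     *
     *   List<Integer> columns = Arrays.asList(
     *           CBBingCombinedFields.CBBingCombined_title.ordinal());
     *   String cleaned = new LineFilter(columns).call(rawTabSeparatedLine);
     */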

    public static void main(String[] args) throws IOException {
        ExtractFeatureOptions optionsFormatRawToDatabase = new ExtractFeatureOptions(args);

        String inputPathString = optionsFormatRawToDatabase.getInputPath();
        String outputPathString = optionsFormatRawToDatabase.getOutputPath();
        // String indexPathString = optionsFormatRawToDatabase.getIndexPath();
        int numberOfPartitions = optionsFormatRawToDatabase.getNumberOfPartitions();
        // int maxCutoffThreshold = optionsFormatRawToDatabase.getMaximumCutoffThreshold();
        // int minCutoffThreshold = optionsFormatRawToDatabase.getMinimumCutoffThreshold();

        /*
         * Set<String> excludingFeatureNames = new HashSet<String>();
         * excludingFeatureNames.add("login");
         * excludingFeatureNames.add("time");
         * excludingFeatureNames.add("day");
         * excludingFeatureNames.add("hms");
         * excludingFeatureNames.add("fail");
         */

        sLogger.info("Tool: " + ExtractFeatureSpark.class.getSimpleName());
        sLogger.info(" - input path: " + inputPathString);
        sLogger.info(" - output path: " + outputPathString);
        // sLogger.info(" - index path: " + indexPathString);
        sLogger.info(" - number of partitions: " + numberOfPartitions);
        // sLogger.info(" - maximum cutoff: " + maxCutoffThreshold);
        // sLogger.info(" - minimum cutoff: " + minCutoffThreshold);

        // Create a default Hadoop configuration
        Configuration conf = new Configuration();
        // Pass the created configuration to HDFS
        FileSystem fs = FileSystem.get(conf);

        // delete the output path if it already exists
        Path outputPath = new Path(outputPathString);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        SparkConf sparkConf = new SparkConf().setAppName(optionsFormatRawToDatabase.toString());
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        // columns to extract from every record
        List<Integer> listOfFeatureIndices = new LinkedList<Integer>();
        listOfFeatureIndices.add(CBBingCombinedFields.CBBingCombined_exactCanonicalQuery.ordinal());
        listOfFeatureIndices.add(CBBingCombinedFields.CBBingCombined_title.ordinal());
        listOfFeatureIndices.add(CBBingCombinedFields.CBBingCombined_description.ordinal());
        listOfFeatureIndices.add(CBBingCombinedFields.CBBingCombined_displayUrl.ordinal());
        listOfFeatureIndices.add(CBBingCombinedFields.CBBingCombined_targetUrl.ordinal());

        JavaRDD<String> rawLines = sc.textFile(inputPathString).repartition(numberOfPartitions);
        JavaRDD<String> tokenizedLines = rawLines.map(new LineFilter(listOfFeatureIndices));
        tokenizedLines.saveAsTextFile(outputPathString);

        /*
         * Iterator<String[]> strArrIter = tokenizedLines.collect().iterator();
         * while (strArrIter.hasNext()) {
         *     sLogger.info(Arrays.toString(strArrIter.next()));
         * }
         */

        sc.stop();
    }

    /**
     * Serializes {@code object} to the given HDFS file.
     *
     * @deprecated
     */
    @Deprecated
    public static boolean writeToHDFS(Object object, String fileName) {
        // Create a default Hadoop configuration
        Configuration conf = new Configuration();
        // Specifies a new file in HDFS.
        Path filenamePath = new Path(fileName);

        try {
            // Pass the created configuration to HDFS
            FileSystem fs = FileSystem.get(conf);

            // fail if the file already exists
            if (fs.exists(filenamePath)) {
                throw new IOException("File already exists: " + fileName);
            }

            FSDataOutputStream fos = fs.create(filenamePath);
            ObjectOutputStream oos = new ObjectOutputStream(fos);
            oos.writeObject(object);
            // close the object stream first so its buffered data is flushed
            // before the underlying HDFS stream is closed
            oos.close();
            fos.close();
            return true;
        } catch (IOException ioe) {
            ioe.printStackTrace();
            return false;
        }
    }
}
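
/*
 * Illustrative launch (hypothetical; the jar name is made up and the option
 * names depend on ExtractFeatureOptions, which is defined elsewhere and
 * parses args):
 *
 *   spark-submit \
 *       --class com.yahoo.spaclu.data.extract.ExtractFeatureSpark \
 *       spaclu.jar \
 *       <options understood by ExtractFeatureOptions: input path,
 *        output path, number of partitions>
 *
 * The job reads TAB-separated records from the input path, keeps the query,
 * title, description, and URL columns, normalizes them with the pattern list
 * above, and writes the result back to the output path as text files.
 */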