clustering.init.WordSepMapper.java Source code

Introduction

Here is the source code for clustering.init.WordSepMapper.java
Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package clustering.init;

import clustering.Utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Mapper class to prepare data.
 *
 * @author edwardlol
 *         Created by edwardlol on 2017/4/20.
 */
public class WordSepMapper extends Mapper<LongWritable, Text, Text, Text> {
    //~ Static fields ----------------------------------------------------------

    /** Index of the g_name column in an input row. */
    private static final int NAME_INDEX = 3;

    /** Index of the optional g_model column in an input row. */
    private static final int MODEL_INDEX = 4;

    //~ Instance fields --------------------------------------------------------

    /** Reused output key, set to {@code entry_id@@g_no} for every record. */
    private final Text outputKey = new Text();

    /** Reused output value, set to {@code <name>##<model>} for every record. */
    private final Text outputValue = new Text();

    /** Maps a regular expression to the text that replaces each of its matches. */
    private final Map<String, String> synonymsMap = new HashMap<>();

    /** Regular expression used to split an input line into columns. */
    private String splitter;

    //~ Methods ----------------------------------------------------------------

    /**
     * Chooses the column splitter from the input file's extension and
     * fills the synonym dictionary.
     *
     * <p>{@code tsv} files are split on tabs and {@code csv} files on commas;
     * for any other extension the splitter is read from the
     * {@code column.splitter} configuration entry.
     *
     * @param context task context giving access to the input split and the
     *                job configuration
     * @throws IOException          propagated from {@code super.setup}
     * @throws InterruptedException propagated from {@code super.setup}
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);

        String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
        String extension = FileUtils.getExtension(fileName);

        switch (extension) {
        case "tsv":
            this.splitter = "\t";
            break;
        case "csv":
            this.splitter = ",";
            break;
        default:
            // NOTE(review): if "column.splitter" is not configured this is
            // presumably null and map() will fail on split() - confirm that
            // the job driver always sets it for other extensions.
            Configuration conf = context.getConfiguration();
            this.splitter = conf.get("column.splitter");
        }

        // NOTE(review): several keys/values below show up as "?" or as empty
        // strings, which looks like non-ASCII text lost to a wrong file
        // encoding. A bare "?" is not a valid regular expression, and the
        // repeated "" / "?" keys overwrite each other in the map - verify
        // these literals against the original file.
        // TODO: 17-4-21 read from file
        // dicts
        this.synonymsMap.put("", "?");
        this.synonymsMap.put("?", "?");
        // 0901
        this.synonymsMap.put("(?:|?)", "?");
        this.synonymsMap.put(
                "(?:||||||)",
                "");
        this.synonymsMap.put("(?:|||)", "");
        this.synonymsMap.put("?", "");
        // 8703
        this.synonymsMap.put("5", "");
        this.synonymsMap.put("7", "");
        this.synonymsMap.put("(?:4maitc|4mat1c|4mat2c)", "4matic");
        this.synonymsMap.put("(?:ican-am|can-am)", "canam");
        this.synonymsMap.put("cfm0to", "cfmoto");
        this.synonymsMap.put("bmw", "?");
        this.synonymsMap.put("benz", "");
        this.synonymsMap.put("audi", "");
        this.synonymsMap.put("(?:mercecles|mercede)", "mercedes");
        this.synonymsMap.put("(?:ferraei|ferrair)", "ferrari");
        this.synonymsMap.put("", "");
        this.synonymsMap.put("", "?");
        this.synonymsMap.put("(?:?|?)", "");
        this.synonymsMap.put("", "");
        this.synonymsMap.put("?", "?");
        this.synonymsMap.put("", "");
        this.synonymsMap.put("", "");
        this.synonymsMap.put("(?:|)", "");
    }

    /**
     * Reads one input row, replaces the synonyms in g_name and g_model,
     * passes both through {@code SepUtils.append} and emits the result.
     *
     * <p>The output key is {@code entry_id@@g_no}; the output value is the
     * processed name followed by {@code ##} and, when the row has a model
     * column, the processed model.
     *
     * @param key     position
     * @param value   one row whose columns, separated by the splitter chosen
     *                in {@code setup}, are in order: entry_id, g_no, code_ts,
     *                g_name, [g_model], [other columns]
     * @param context task context the output pair is written to
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if writing the output pair is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String[] contents = value.toString().split(this.splitter);
        String name = replaceSynonyms(contents[NAME_INDEX]);

        String nameAndModel = SepUtils.append(name) + "##";

        // g_model is optional. String.split drops trailing empty columns, so
        // a row whose model is empty can have exactly four columns; the model
        // may only be read when a fifth column is really present.
        if (contents.length > MODEL_INDEX) {
            String model = replaceSynonyms(contents[MODEL_INDEX]);
            nameAndModel += SepUtils.append(model);
        }

        this.outputKey.set(contents[0] + "@@" + contents[1]);
        this.outputValue.set(nameAndModel);
        context.write(this.outputKey, this.outputValue);
    }

    /**
     * Replace all the synonyms in the original sentence.
     *
     * <p>Every key of the synonym dictionary is applied as a regular
     * expression, one after another, to the result of the previous
     * replacement.
     *
     * @param origin the text to normalize
     * @return the text after all replacements have been applied
     */
    // TODO: 17-4-21 if this process takes too long, make it an independent step
    private String replaceSynonyms(String origin) {
        String result = origin;
        for (Map.Entry<String, String> entry : this.synonymsMap.entrySet()) {
            result = result.replaceAll(entry.getKey(), entry.getValue());
        }
        return result;
    }
}

// End WordSepMapper.java