Java tutorial
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package clustering.init;

import clustering.Utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Mapper class to prepare data.
 *
 * <p>For every input row it rewrites known synonyms / misspellings in the
 * name and model columns, passes both through {@code SepUtils.append} and
 * emits {@code <col0>@@<col1>} as key and {@code <name>##<model>} as value.
 *
 * @author edwardlol
 *         Created by edwardlol on 2017/4/20.
 */
public class WordSepMapper extends Mapper<LongWritable, Text, Text, Text> {
    //~ Instance fields --------------------------------------------------------

    /** Reused output key, to avoid allocating a new Text per record. */
    private final Text outputKey = new Text();

    /** Reused output value, to avoid allocating a new Text per record. */
    private final Text outputValue = new Text();

    /** Raw dictionary: regular expression -> replacement text. */
    private final Map<String, String> synonymsMap = new HashMap<>();

    /**
     * Compiled form of {@link #synonymsMap}, built once in
     * {@link #setup(Context)} so the expressions are not recompiled for
     * every record. Iteration order equals the iteration order of
     * {@link #synonymsMap}.
     */
    private final Map<Pattern, String> synonymPatterns = new LinkedHashMap<>();

    /** Regular expression used to split an input row into columns. */
    private String splitter;

    //~ Methods ----------------------------------------------------------------

    /**
     * Chooses the column splitter from the input file's extension
     * ({@code tsv} -> tab, {@code csv} -> comma, anything else -> the
     * {@code column.splitter} configuration property) and loads the
     * synonym dictionary.
     *
     * @param context task context supplying the input split and configuration
     * @throws IllegalStateException if no splitter can be determined
     * {@inheritDoc}
     */
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        super.setup(context);

        String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
        String extention = FileUtils.getExtension(fileName);
        switch (extention) {
            case "tsv":
                this.splitter = "\t";
                break;
            case "csv":
                this.splitter = ",";
                break;
            default:
                Configuration conf = context.getConfiguration();
                this.splitter = conf.get("column.splitter");
        }
        // Configuration.get returns null for an unset property; fail here with
        // a clear message rather than with a bare NullPointerException in map().
        if (this.splitter == null) {
            throw new IllegalStateException("No column splitter for input file '"
                    + fileName + "': set the 'column.splitter' property");
        }

        // TODO: 17-4-21 read from file
        // NOTE(review): several keys/values below are empty strings or a bare
        // "?", which looks like non-ASCII text lost to an encoding problem.
        // A bare "?" (also inside "(?:|?)" / "(?:?|?)") is not a valid regular
        // expression - verify these literals against the original dictionary.
        // dicts
        this.synonymsMap.put("", "?");
        this.synonymsMap.put("?", "?");
        // 0901
        this.synonymsMap.put("(?:|?)", "?");
        this.synonymsMap.put("(?:||||||)", "");
        this.synonymsMap.put("(?:|||)", "");
        this.synonymsMap.put("?", "");
        // 8703
        this.synonymsMap.put("5", "");
        this.synonymsMap.put("7", "");
        this.synonymsMap.put("(?:4maitc|4mat1c|4mat2c)", "4matic");
        this.synonymsMap.put("(?:ican-am|can-am)", "canam");
        this.synonymsMap.put("cfm0to", "cfmoto");
        this.synonymsMap.put("bmw", "?");
        this.synonymsMap.put("benz", "");
        this.synonymsMap.put("audi", "");
        this.synonymsMap.put("(?:mercecles|mercede)", "mercedes");
        this.synonymsMap.put("(?:ferraei|ferrair)", "ferrari");
        this.synonymsMap.put("", "");
        this.synonymsMap.put("", "?");
        this.synonymsMap.put("(?:?|?)", "");
        this.synonymsMap.put("", "");
        this.synonymsMap.put("?", "?");
        this.synonymsMap.put("", "");
        this.synonymsMap.put("", "");
        this.synonymsMap.put("(?:|)", "");

        // Compile every expression once. Walking synonymsMap here keeps the
        // replacements in the order a direct loop over that map would use.
        this.synonymPatterns.clear();
        for (Map.Entry<String, String> entry : this.synonymsMap.entrySet()) {
            this.synonymPatterns.put(Pattern.compile(entry.getKey()), entry.getValue());
        }
    }

    /**
     * Read the input file, extract the commodity info, and split the words
     * in g_name and g_model.
     *
     * <p>The output value always contains the {@code ##} separator; the part
     * after it is empty when the row has no g_model column. A row with fewer
     * than four columns still fails with
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param key   position
     * @param value entry_id@@g_no@@code_ts@@g_name@@[g_model][@@other_columns]
     * {@inheritDoc}
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {

        String[] contents = value.toString().split(this.splitter);

        String name = replaceSynonyms(contents[3]);
        String nameAndModel = SepUtils.append(name) + "##";

        // g_model is the fifth column (index 4) and is optional. String.split
        // also drops trailing empty columns, so four-column rows do occur;
        // the guard has to be "> 4", not "> 3".
        if (contents.length > 4) {
            String model = replaceSynonyms(contents[4]);
            nameAndModel += SepUtils.append(model);
        }

        this.outputKey.set(contents[0] + "@@" + contents[1]);
        this.outputValue.set(nameAndModel);
        context.write(this.outputKey, this.outputValue);
    }

    /**
     * Replace all the synonyms in the original sentence.
     *
     * <p>Each dictionary entry is applied in turn to the result of the
     * previous one, using the patterns compiled in {@link #setup(Context)}.
     *
     * @param origin text to normalise
     * @return {@code origin} with every dictionary match replaced
     */
    // TODO: 17-4-21 if this process takes too long, make it an independent step
    private String replaceSynonyms(String origin) {
        String result = origin;
        for (Map.Entry<Pattern, String> entry : this.synonymPatterns.entrySet()) {
            result = entry.getKey().matcher(result).replaceAll(entry.getValue());
        }
        return result;
    }
}

// End WordSepMapper.java